分布式 PyTorch 初探

Important

Import 相关模块#

1
import os
2
import time
3
import torch
4
import torch.distributed as dist

环境初始化#

# Process coordinates come from the environment (set by torchrun/mpirun).
# When a variable is missing, fall back to a single-process layout.
rank = int(os.environ.get("RANK", "0"))
world_size = int(os.environ.get("WORLD_SIZE", "1"))
local_rank = int(os.environ.get("LOCAL_RANK", "0"))

# Pick the device and the matching communication backend in one place:
# GPU processes bind to cuda:<local_rank> and use NCCL, CPU-only processes
# use Gloo.
if torch.cuda.is_available():
    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
    backend = "nccl"
else:
    device = torch.device("cpu")
    backend = "gloo"

# Join the default process group with the coordinates read above.
dist.init_process_group(backend=backend, rank=rank, world_size=world_size)

集合通信操作#

AllReduce#

# All-reduce demo: each rank contributes one tensor and the SUM reduction is
# written back into that same tensor on every rank.

# Line all processes up first so the prints below are grouped per section.
dist.barrier()

# [0, 1, 2, 3] shifted by the rank; serves as both input and output buffer.
data = rank + torch.tensor([0.0, 1.0, 2.0, 3.0], device=device)

print(f"Rank {rank} [before all-reduce]: {data}", flush=True)
dist.all_reduce(data, op=dist.ReduceOp.SUM)  # blocking call, updates `data` in place
print(f"Rank {rank} [after all-reduce]: {data}", flush=True)

ReduceScatter#

# Reduce-scatter demo: every rank supplies `world_size` values and receives a
# single reduced (SUM) element in its one-element output buffer.
dist.barrier()

# Input: [0, 1, ..., world_size - 1] shifted by the rank.
input_ts = rank + torch.arange(world_size, dtype=torch.float32, device=device)
# Output buffer. torch.empty(1, device=device) would serve as well; zeros is
# presumably used so the "before" print shows a defined value.
output_ts = torch.zeros(1, device=device)

print(f"Rank {rank} [before reduce-scatter]: input = {input_ts}, output = {output_ts}", flush=True)
dist.reduce_scatter_tensor(output_ts, input_ts, op=dist.ReduceOp.SUM)  # blocking call
print(f"Rank {rank} [after reduce-scatter]: input = {input_ts}, output = {output_ts}", flush=True)

AllGather#

# All-gather demo: each rank contributes its one-element tensor and receives
# the contributions of all ranks in a `world_size`-element output buffer.
dist.barrier()

# The reduce-scatter result becomes the input; a fresh buffer is allocated for
# the gathered output. torch.empty(world_size, device=device) would serve as
# well; zeros is presumably used so the "before" print shows defined values.
input_ts, output_ts = output_ts, torch.zeros(world_size, device=device)

print(f"Rank {rank} [before all-gather]: input = {input_ts}, output = {output_ts}", flush=True)
dist.all_gather_into_tensor(output_ts, input_ts)  # blocking call
print(f"Rank {rank} [after all-gather]: input = {input_ts}, output = {output_ts}", flush=True)

运行结果#

在一个包含 4 个 GPU 的节点上运行的结果如下：

在两个节点（每个节点包含 4 个 GPU）上运行的结果如下：

带宽基准测试#

创建 Tensor#

# Create the benchmark payload on this process's device.
# `device` is the one selected during environment initialization (built from
# LOCAL_RANK), so the tensor lands on the GPU this process is bound to; the
# global rank is not a valid CUDA index on multi-node runs.
# NOTE(review): `num_elements` is not defined in the visible snippets; it must
# be set by the surrounding script before this line runs.
data = torch.randn(num_elements, device=device)

Warmup#

# Warmup: run the collective once before timing it, so one-time setup cost is
# not charged to the measured run.
dist.all_reduce(tensor=data, op=dist.ReduceOp.SUM, async_op=False)
# Only synchronize when CUDA is in use; the setup above also supports a
# CPU/gloo fallback, where torch.cuda.synchronize() would raise.
if torch.cuda.is_available():
    torch.cuda.synchronize()  # Wait for CUDA kernels to finish
dist.barrier()                # Wait for all the processes to get here

记录时间#

# Time one all-reduce. perf_counter() is a monotonic, high-resolution clock,
# so the measured interval cannot be distorted by wall-clock adjustments.
start_time = time.perf_counter()
dist.all_reduce(tensor=data, op=dist.ReduceOp.SUM, async_op=False)
# Only synchronize when CUDA is in use; the setup above also supports a
# CPU/gloo fallback, where torch.cuda.synchronize() would raise.
if torch.cuda.is_available():
    torch.cuda.synchronize()  # Wait for CUDA kernels to finish
dist.barrier()                # Wait for all the processes to get here
end_time = time.perf_counter()
7

8

9
def render_duration(duration: float) -> str:
    """Format a duration given in seconds as a short human-readable string.

    The unit is chosen from the magnitude of ``duration``: microseconds
    below one millisecond, milliseconds below one second, seconds otherwise.
    The value is always rendered with two decimals.

    Args:
        duration: Elapsed time in seconds. Negative values (possible when
            the timestamps come from a non-monotonic clock) keep their sign
            and are scaled by magnitude rather than always being shown in
            microseconds.

    Returns:
        The formatted duration, e.g. ``"500.00ms"`` or ``"2.00s"``.
    """
    magnitude = abs(duration)
    if magnitude < 1e-3:
        return f"{duration * 1e6:.2f}us"
    if magnitude < 1:
        return f"{duration * 1e3:.2f}ms"
    return f"{duration:.2f}s"
15

16

# Elapsed time of the timed all-reduce, reported once per rank.
duration = end_time - start_time
print(
    f"[all_reduce] Rank {rank}: "
    f"all_reduce(world_size={world_size}, num_elements={num_elements}) "
    f"took {render_duration(duration)}",
    flush=True,
)

计算带宽#

# Measure the effective bandwidth of the timed all-reduce.
dist.barrier()
size_bytes = data.element_size() * data.numel()
# 2x because send + receive, world_size-1 steps in all-reduce
sent_bytes = size_bytes * 2 * (world_size - 1)
# `sent_bytes` is summed over all ranks, so divide by the summed time to get a
# per-rank figure.
total_duration = world_size * duration
bandwidth = sent_bytes / total_duration  # bytes per second
# Convert with 1e9 (decimal gigabytes) so the value matches the "GB/s" label
# and is comparable with vendor link-bandwidth figures; dividing by 1024**3
# would yield GiB/s instead.
print(f"[all_reduce] Rank {rank}: all_reduce measured bandwidth = {round(bandwidth / 1e9)} GB/s", flush=True)

运行结果#

在一个搭载 4 个 GPU 的节点上运行的结果如下：

值得注意的是，理论最大带宽是 200 GB/s；`nvidia-smi nvlink -s` 的输出如下：

释放资源#

# Teardown: wait until every process has finished its work, then release the
# default process group.
dist.barrier()
dist.destroy_process_group()

Import 相关模块#

环境初始化#

集合通信操作#

AllReduce#

ReduceScatter#

AllGather#

运行结果#

带宽基准测试#

创建 Tensor#

Warmup#

记录时间#

计算带宽#

运行结果#

释放资源#

支持与分享

目录