Repeating blocks
For ResNet18 and ResNet34, the blocks repeat in such a way that the dashed (projection) shortcut exists only in the first block of a group, and only from the second group of repeating blocks onward. We'll need PyTorch's nn.ModuleList which, despite the name, is not just a plain Python list, and here's why: PyTorch becomes "aware" of the nn.Modules stored in it and can track and update the parameters of those modules.
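Here's a minimal sketch (the class names are made up just for illustration) of what that awareness buys us: parameters held in a plain Python list never show up in model.parameters(), while an nn.ModuleList registers them.
import torch.nn as nn

class PlainList(nn.Module):
    def __init__(self):
        super(PlainList, self).__init__()
        self.layers = [nn.Linear(4, 4), nn.Linear(4, 4)]  # invisible to PyTorch

class TrackedList(nn.Module):
    def __init__(self):
        super(TrackedList, self).__init__()
        self.layers = nn.ModuleList([nn.Linear(4, 4), nn.Linear(4, 4)])  # registered

print(len(list(PlainList().parameters())))    # 0 -- an optimizer would never see these
print(len(list(TrackedList().parameters())))  # 4 -- two weights and two biases
With that in mind, the repeating blocks for ResNet18/34 look like this: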
class RepeatingBlocks(nn.Module):
    def __init__(self, 
                in_block: int, out_channels: list[int],
                kernel_sizes: list[int], no_blocks: int, 
                dashed_shortcut=True):
        super(RepeatingBlocks, self).__init__()
        self.repeating_blocks = nn.ModuleList()
        for _ in range(no_blocks):
            self.repeating_blocks.append(
                ResidualBlock(in_block, out_channels, kernel_sizes, dashed_shortcut)
            )
            # only the first block of a group gets the dashed (projection) shortcut
            dashed_shortcut = False 
            # the previous block's output channels feed the next block
            in_block = out_channels[-1]
        self.repeating_blocks = nn.Sequential(*self.repeating_blocks)

    def forward(self, x):
        x = self.repeating_blocks(x)
        return x
The class above loops through the number of repeating blocks in each group of a ResNet. If a dashed shortcut starts a group, it applies only to the first block of that group, so we disable it after the first iteration. Likewise, the output channels of the previous block become the input channels of the next block.
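As a quick sanity check, and assuming the ResidualBlock from the previous section downsamples by stride 2 when the dashed shortcut is enabled, a group of two blocks should take a 64-channel 56x56 feature map to a 128-channel 28x28 one:
import torch

group = RepeatingBlocks(64, [128, 128], [3, 3], 2)  # dashed_shortcut=True by default
x = torch.randn(1, 64, 56, 56)
print(group(x).shape)  # expected: torch.Size([1, 128, 28, 28])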
Now, let's implement the ResNet18 architecture:
class Lambda(nn.Module): # wraps a function so it can sit inside nn.Sequential
    def __init__(self, func):
        super(Lambda, self).__init__()
        self.func = func
    def forward(self, x):
        return self.func(x)

resnet18 = nn.Sequential(
    nn.Conv2d(3, 64, 7, 2, 3), # you can also use keyword arguments
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(3, 2, 1),
    RepeatingBlocks(64, [64, 64], [3, 3], 2, False),
    RepeatingBlocks(64, [128, 128], [3, 3], 2),
    RepeatingBlocks(128, [256, 256], [3, 3], 2),
    RepeatingBlocks(256, [512, 512], [3, 3], 2),
    nn.AvgPool2d(7, 1),
    Lambda(lambda x: x.view(x.shape[0], -1)), # flatten (N, 512, 1, 1) to (N, 512)
    nn.Linear(512, 1000),
    nn.Softmax(dim=1)
)
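A quick forward pass with a dummy batch of 224x224 images is a cheap way to confirm the shapes line up (again assuming the earlier ResidualBlock performs the stride-2 downsampling in its dashed-shortcut blocks):
x = torch.randn(2, 3, 224, 224)
out = resnet18(x)
print(out.shape)       # expected: torch.Size([2, 1000])
print(out.sum(dim=1))  # each softmax row sums to ~1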
This implementation takes into consideration the input image size and layer parameters as discussed in the paper. The implementation of ResNet34 is as below:
resnet34 = nn.Sequential(
    nn.Conv2d(3, 64, 7, 2, 3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(3, 2, 1),
    RepeatingBlocks(64, [64, 64], [3, 3], 3, False),
    RepeatingBlocks(64, [128, 128], [3, 3], 4),
    RepeatingBlocks(128, [256, 256], [3, 3], 6),
    RepeatingBlocks(256, [512, 512], [3, 3], 3),
    nn.AvgPool2d(7, 1),
    Lambda(lambda x: x.view(x.shape[0], -1)), # flatten (N, 512, 1, 1) to (N, 512)
    nn.Linear(512, 1000),
    nn.Softmax(dim=1)
)
Now for the deeper ResNets, that is, the 50-layer, 101-layer and 152-layer variants, the building block is not quite the residual block above; it is known as a bottleneck block. The bottleneck uses a projection shortcut for the first block of each group and identity mappings for the rest of the blocks in that group. And again, as with the residual block, the stride is set to 1 for the bottlenecks after the first one in a group. Here's how the strides are distributed across the groups of bottleneck blocks conv2_x, conv3_x, conv4_x and conv5_x.
I'm not entirely certain about this, but do correct me if I'm wrong.
~ 1x1 convolutions have padding 0 and 3x3 convolutions have padding 1; in the code this falls out of k // 2 (see the quick shape check after this list).
~ conv2_x uses stride 1 for all of its blocks, while conv{3,4,5}_x use stride 2 only for their first block. This is done by passing a stride parameter that is reset to 1 after the first block.
~ the first block of each group uses a projection shortcut, while the rest use identity mappings in their skip connections.
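A minimal check of that padding and stride arithmetic, using bare convolutions and nothing ResNet-specific:
x = torch.randn(1, 64, 56, 56)
print(nn.Conv2d(64, 64, 1, 1, 1 // 2)(x).shape)  # 1x1, padding 0: (1, 64, 56, 56)
print(nn.Conv2d(64, 64, 3, 1, 3 // 2)(x).shape)  # 3x3, padding 1: (1, 64, 56, 56)
print(nn.Conv2d(64, 64, 3, 2, 3 // 2)(x).shape)  # 3x3, stride 2:  (1, 64, 28, 28)
With that settled, the bottleneck block itself: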
class Bottleneck(nn.Module):
    def __init__(self, in_block: int, out_channels: list[int], 
            kernel_sizes: list[int], stride, first_block: bool):
        super(Bottleneck, self).__init__()
        in_skip = in_block   # remember the input channels for the projection shortcut
        in_stride = stride   # the shortcut must downsample by the same stride
        self.layers = nn.ModuleList()
        for f, k in zip(out_channels, kernel_sizes):
            self.layers.extend([
                nn.Conv2d(in_block, f, k, stride, k // 2, bias=False),
                nn.BatchNorm2d(f),
                nn.ReLU()])
            in_block, stride = f, 1  # only the first convolution carries the stride
        self.layers = self.layers[:-1]  # drop the last ReLU; it is applied after the addition
        self.layers = nn.Sequential(*self.layers)
        if first_block:
            # projection shortcut: 1x1 convolution to match channels and stride
            self.skip_layer = nn.Sequential(
                nn.Conv2d(in_skip, f, 1, in_stride, 0, bias=False),
                nn.BatchNorm2d(f))
        else:
            self.skip_layer = nn.Identity()

    def forward(self, x):
        skip_output = self.skip_layer(x) 
        x = self.layers(x)
        x += skip_output
        return nn.ReLU()(x) # not weighted, can be called directly
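A quick shape check of the two cases: the first block of conv3_x projects 256 channels at 56x56 down to 512 channels at 28x28, while a later block in the same group maps 512 channels at 28x28 back to the same shape.
first = Bottleneck(256, [128, 128, 512], [1, 3, 1], stride=2, first_block=True)
later = Bottleneck(512, [128, 128, 512], [1, 3, 1], stride=1, first_block=False)

x = torch.randn(1, 256, 56, 56)
x = first(x)
print(x.shape)        # torch.Size([1, 512, 28, 28]) -- projection shortcut, stride 2
print(later(x).shape) # torch.Size([1, 512, 28, 28]) -- identity shortcut, stride 1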
Now onto the class for the repeating blocks of the bottleneck layer; here's what needs to be done.
class RepeatingBottleNecks(nn.Module):
    def __init__(self, 
                in_block: int, out_channels: list[int],
                kernel_sizes: list[int], no_blocks: int, 
                stride):
        super(RepeatingBottleNecks, self).__init__()
        self.repeating_blocks = nn.ModuleList()
        is_first = True
        for _ in range(no_blocks):
            self.repeating_blocks.append(
                Bottleneck(in_block, out_channels, kernel_sizes, stride, is_first)
            )
            # after the first block: stride 1 and an identity shortcut
            stride, is_first = 1, False 
            in_block = out_channels[-1]
        self.repeating_blocks = nn.Sequential(*self.repeating_blocks)

    def forward(self, x):
        x = self.repeating_blocks(x)
        return x
With the above two custom classes inheriting from nn.Module implemented, we can now build the 50-layer, 101-layer and 152-layer ResNets.
~ 50-layer ResNet
resnet50 = nn.Sequential(
    nn.Conv2d(3, 64, 7, 2, 3), 
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(3, 2, 1),
    RepeatingBottleNecks(64, [64, 64, 256], [1, 3, 1], 3, 1),
    RepeatingBottleNecks(256, [128, 128, 512], [1, 3, 1], 4, 2),
    RepeatingBottleNecks(512, [256, 256, 1024], [1, 3, 1], 6, 2),
    RepeatingBottleNecks(1024, [512, 512, 2048], [1, 3, 1], 3, 2),
    nn.AvgPool2d(7, 1),
    Lambda(lambda x: x.view(x.shape[0], -1)), # flatten (N, 2048, 1, 1) to (N, 2048)
    nn.Linear(2048, 1000),
    nn.Softmax(dim=1)
)
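Since the bottleneck path is new, it's worth checking the full 50-layer model end to end with a dummy 224x224 input:
x = torch.randn(1, 3, 224, 224)
print(resnet50(x).shape)  # expected: torch.Size([1, 1000])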
~ 101-layer ResNet
resnet101 = nn.Sequential(
    nn.Conv2d(3, 64, 7, 2, 3), 
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(3, 2, 1),
    RepeatingBottleNecks(64, [64, 64, 256], [1, 3, 1], 3, 1),
    RepeatingBottleNecks(256, [128, 128, 512], [1, 3, 1], 4, 2),
    RepeatingBottleNecks(512, [256, 256, 1024], [1, 3, 1], 23, 2),
    RepeatingBottleNecks(1024, [512, 512, 2048], [1, 3, 1], 3, 2),
    nn.AvgPool2d(7, 1),
    Lambda(lambda x: x.view(x.shape[0], -1)), # flatten (N, 2048, 1, 1) to (N, 2048)
    nn.Linear(2048, 1000),
    nn.Softmax(dim=1)
)
~ 152-layer ResNet
resnet152 = nn.Sequential(
    nn.Conv2d(3, 64, 7, 2, 3), 
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(3, 2, 1),
    RepeatingBottleNecks(64, [64, 64, 256], [1, 3, 1], 3, 1),
    RepeatingBottleNecks(256, [128, 128, 512], [1, 3, 1], 8, 2),
    RepeatingBottleNecks(512, [256, 256, 1024], [1, 3, 1], 36, 2),
    RepeatingBottleNecks(1024, [512, 512, 2048], [1, 3, 1], 3, 2),
    nn.AvgPool2d(7, 1),
    Lambda(lambda x: x.view(x.shape[0], -1)), # flatten (N, 2048, 1, 1) to (N, 2048)
    nn.Linear(2048, 1000),
    nn.Softmax(dim=1)
)
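Finally, counting parameters is a decent cross-check on the three bottleneck models. The commonly cited figures are roughly 25.6M, 44.5M and 60.2M for the 50-, 101- and 152-layer ResNets, and this implementation should land very close to those (any tiny difference here would come from details like the bias in the stem convolution).
for name, model in [("resnet50", resnet50), ("resnet101", resnet101), ("resnet152", resnet152)]:
    n_params = sum(p.numel() for p in model.parameters())
    print(f"{name}: {n_params / 1e6:.1f}M parameters")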