""" DropBlock, DropPath

PyTorch implementations of DropBlock and DropPath (Stochastic Depth) regularization layers.

Papers:
DropBlock: A regularization method for convolutional networks (https://arxiv.org/abs/1810.12890)

Deep Networks with Stochastic Depth (https://arxiv.org/abs/1603.09382)

Code:
DropBlock impl inspired by two Tensorflow impl that I liked:
 - https://github.com/tensorflow/tpu/blob/master/models/official/resnet/resnet_model.py#L74
 - https://github.com/clovaai/assembled-cnn/blob/master/nets/blocks.py

Hacked together by / Copyright 2020 Ross Wightman
"""
import warnings
from typing import List, Union

import torch
import torch.nn as nn
import torch.nn.functional as F


def drop_block_2d(
        x: torch.Tensor,
        drop_prob: float = 0.1,
        block_size: int = 7,
        gamma_scale: float = 1.0,
        with_noise: bool = False,
        inplace: bool = False,
        couple_channels: bool = True,
        scale_by_keep: bool = True,
) -> torch.Tensor:
    """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf

    DropBlock with an experimental gaussian noise option.

    Args:
        x: Input tensor of shape (B, C, H, W).
        drop_prob: Probability of dropping a block.
        block_size: Size of the block to drop. Clipped per-dimension to the
            spatial size of ``x``.
        gamma_scale: Scale factor for the drop probability. The resulting seed
            rate is capped at 1.0 so that large scales do not produce an invalid
            Bernoulli probability.
        with_noise: If True, add gaussian noise to dropped regions instead of zeros.
            In this mode ``scale_by_keep`` is not applied.
        inplace: If True, perform operation in-place (``x`` is modified and returned).
        couple_channels: If True, all channels share the same drop mask (per the
            original paper). If False, each channel gets an independent mask.
        scale_by_keep: If True, scale kept activations to maintain expected values.
            Only used when ``with_noise`` is False.

    Returns:
        Tensor with dropped blocks, same shape as input.
    """
    B, C, H, W = x.shape
    kh, kw = min(block_size, H), min(block_size, W)

    # Compute gamma (seed drop rate) - probability of dropping each spatial location
    gamma = float(gamma_scale * drop_prob * H * W) / float(kh * kw) / float((H - kh + 1) * (W - kw + 1))
    # gamma is a Bernoulli probability; gamma_scale > 1 can push it past 1.0, which
    # bernoulli_ rejects. Saturate instead (every location becomes a block seed).
    gamma = min(gamma, 1.0)

    # Generate drop mask: 1 at block centers to drop, 0 elsewhere
    # couple_channels=True means all channels share same spatial mask (matches paper)
    noise_shape = (B, 1 if couple_channels else C, H, W)
    with torch.no_grad():
        block_mask = torch.empty(noise_shape, dtype=x.dtype, device=x.device).bernoulli_(gamma)

        # Expand block centers to full blocks using max pooling
        block_mask = F.max_pool2d(
            block_mask,
            kernel_size=(kh, kw),
            stride=1,
            padding=(kh // 2, kw // 2),
        )
        # Handle even kernel sizes - max_pool2d output is 1 larger in each even dimension
        if kh % 2 == 0 or kw % 2 == 0:
            # Fix for even kernels proposed by https://github.com/crutcher
            block_mask = block_mask[..., (kh + 1) % 2:, (kw + 1) % 2:]

        keep_mask = 1. - block_mask

    if with_noise:
        with torch.no_grad():
            # Noise is only non-zero where blocks were dropped
            noise = torch.empty_like(keep_mask).normal_()
            noise.mul_(block_mask)

        if inplace:
            x.mul_(keep_mask).add_(noise)
        else:
            x = x * keep_mask + noise
    else:
        if scale_by_keep:
            with torch.no_grad():
                # Normalize to maintain expected values (scale up kept activations).
                # Sum in float32 for accuracy; epsilon avoids division by zero when
                # everything was dropped.
                normalize_scale = keep_mask.numel() / keep_mask.to(dtype=torch.float32).sum().add(1e-7)
                keep_mask.mul_(normalize_scale.to(x.dtype))

        if inplace:
            x.mul_(keep_mask)
        else:
            x = x * keep_mask

    return x


class DropBlock2d(nn.Module):
    """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf

    Module wrapper around `drop_block_2d`. It is an identity op when the module is
    not in training mode or when ``drop_prob`` is zero.

    Args:
        drop_prob: Probability of dropping a block.
        block_size: Size of the block to drop.
        gamma_scale: Scale factor for the drop probability.
        with_noise: If True, add gaussian noise to dropped regions instead of zeros.
        inplace: If True, perform operation in-place.
        couple_channels: If True, all channels share the same drop mask (per the
            original paper). If False, each channel gets an independent mask.
        scale_by_keep: If True, scale kept activations to maintain expected values.
        **kwargs: Accepted for backwards compatibility. ``batchwise`` and ``fast``
            are ignored silently; any other key triggers a warning.
    """

    def __init__(
            self,
            drop_prob: float = 0.1,
            block_size: int = 7,
            gamma_scale: float = 1.0,
            with_noise: bool = False,
            inplace: bool = False,
            couple_channels: bool = True,
            scale_by_keep: bool = True,
            **kwargs,
    ):
        super().__init__()
        self.drop_prob = drop_prob
        self.gamma_scale = gamma_scale
        self.block_size = block_size
        self.with_noise = with_noise
        self.inplace = inplace
        self.couple_channels = couple_channels
        self.scale_by_keep = scale_by_keep

        # Backwards compatibility: silently consume args removed in v1.0.23, warn on unknown
        deprecated_args = {'batchwise', 'fast'}
        for k in kwargs:
            if k not in deprecated_args:
                # stacklevel=2 attributes the warning to the code constructing the layer
                warnings.warn(f"DropBlock2d() got unexpected keyword argument '{k}'", stacklevel=2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply DropBlock in training mode; return ``x`` unchanged otherwise."""
        if not self.training or not self.drop_prob:
            return x
        return drop_block_2d(
            x,
            drop_prob=self.drop_prob,
            block_size=self.block_size,
            gamma_scale=self.gamma_scale,
            with_noise=self.with_noise,
            inplace=self.inplace,
            couple_channels=self.couple_channels,
            scale_by_keep=self.scale_by_keep,
        )


def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
    'survival rate' as the argument.

    Args:
        x: Input tensor; the first dimension is treated as the per-sample dimension.
        drop_prob: Probability of zeroing a whole sample.
        training: If False, ``x`` is returned unchanged.
        scale_by_keep: If True (and the keep probability is non-zero), surviving
            samples are divided by the keep probability.

    Returns:
        ``x`` itself when ``drop_prob == 0`` or not training, otherwise a new tensor
        of the same shape with some samples zeroed.
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        random_tensor.div_(keep_prob)
    return x * random_tensor


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Args:
        drop_prob: Probability of zeroing a whole sample during training.
        scale_by_keep: If True, surviving samples are rescaled by ``1 / (1 - drop_prob)``.
    """

    def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
        super().__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        """Apply `drop_path` using the module's training flag."""
        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)

    def extra_repr(self):
        """Show the drop probability (3 decimals) in the module repr."""
        return f'drop_prob={round(self.drop_prob,3):0.3f}'


def calculate_drop_path_rates(
        drop_path_rate: float,
        depths: Union[int, List[int]],
        stagewise: bool = False,
) -> Union[List[float], List[List[float]]]:
    """Generate drop path rates for stochastic depth.

    Rates always increase linearly from 0 to ``drop_path_rate`` across all blocks;
    ``stagewise`` only controls how the result is grouped:

    1. Per-block (``stagewise=False``): one flat list with a rate per block.
    2. Stage-wise (``stagewise=True``): the same per-block rates, split into one
       sub-list per stage according to ``depths``.

    Args:
        drop_path_rate: Maximum drop path rate (at the end).
        depths: Either a single int for total depth, or a list of ints giving the
            number of blocks in each stage.
        stagewise: If True, return rates grouped per stage (requires ``depths`` to
            be a list). Defaults to False regardless of the type of ``depths``.

    Returns:
        For per-block mode: List of drop rates, one per block.
        For stage-wise mode: List of lists, drop rates per stage.

    Raises:
        ValueError: If ``stagewise`` is True and ``depths`` is a single int.
    """
    if isinstance(depths, int):
        # Single depth value - per-block pattern only
        if stagewise:
            raise ValueError("stagewise=True requires depths to be a list of stage depths")
        return torch.linspace(0, drop_path_rate, depths, device='cpu').tolist()

    # Sequence of stage depths - one linear ramp over every block in every stage
    stage_depths = list(depths)
    rates = torch.linspace(0, drop_path_rate, sum(stage_depths), device='cpu')
    if stagewise:
        # Group the ramp into consecutive per-stage chunks
        return [stage.tolist() for stage in rates.split(stage_depths)]
    return rates.tolist()