fold.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. import torch.nn.functional as F
  2. from torch import Tensor
  3. from torch.nn.common_types import _size_any_t
  4. from .module import Module
  5. __all__ = ["Fold", "Unfold"]
  6. class Fold(Module):
  7. (
  8. r"""Combines an array of sliding local blocks into a large containing tensor.
  9. Consider a batched :attr:`input` tensor containing sliding local blocks,
  10. e.g., patches of images, of shape :math:`(N, C \times \prod(\text{kernel\_size}), L)`,
  11. where :math:`N` is batch dimension, :math:`C \times \prod(\text{kernel\_size})`
  12. is the number of values within a block (a block has :math:`\prod(\text{kernel\_size})`
  13. spatial locations each containing a :math:`C`-channeled vector), and
  14. :math:`L` is the total number of blocks. (This is exactly the
  15. same specification as the output shape of :class:`~torch.nn.Unfold`.) This
  16. operation combines these local blocks into the large :attr:`output` tensor
  17. of shape :math:`(N, C, \text{output\_size}[0], \text{output\_size}[1], \dots)`
  18. by summing the overlapping values. Similar to :class:`~torch.nn.Unfold`, the
  19. arguments must satisfy
  20. .. math::
  21. L = \prod_d \left\lfloor\frac{\text{output\_size}[d] + 2 \times \text{padding}[d] %
  22. - \text{dilation}[d] \times (\text{kernel\_size}[d] - 1) - 1}{\text{stride}[d]} + 1\right\rfloor,
  23. where :math:`d` is over all spatial dimensions.
  24. * :attr:`output_size` describes the spatial shape of the large containing
  25. tensor of the sliding local blocks. It is useful to resolve the ambiguity
  26. when multiple input shapes map to same number of sliding blocks, e.g.,
  27. with ``stride > 0``.
  28. The :attr:`padding`, :attr:`stride` and :attr:`dilation` arguments specify
  29. how the sliding blocks are retrieved.
  30. * :attr:`stride` controls the stride for the sliding blocks.
  31. * :attr:`padding` controls the amount of implicit zero-paddings on both
  32. sides for :attr:`padding` number of points for each dimension before
  33. reshaping.
  34. """
  35. """
  36. * :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm.
  37. It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
  38. """
  39. r"""
  40. Args:
  41. output_size (int or tuple): the shape of the spatial dimensions of the
  42. output (i.e., ``output.sizes()[2:]``)
  43. kernel_size (int or tuple): the size of the sliding blocks
  44. dilation (int or tuple, optional): a parameter that controls the
  45. stride of elements within the
  46. neighborhood. Default: 1
  47. padding (int or tuple, optional): implicit zero padding to be added on
  48. both sides of input. Default: 0
  49. stride (int or tuple): the stride of the sliding blocks in the input
  50. spatial dimensions. Default: 1
  51. * If :attr:`output_size`, :attr:`kernel_size`, :attr:`dilation`,
  52. :attr:`padding` or :attr:`stride` is an int or a tuple of length 1 then
  53. their values will be replicated across all spatial dimensions.
  54. * For the case of two output spatial dimensions this operation is sometimes
  55. called ``col2im``.
  56. .. note::
  57. :class:`~torch.nn.Fold` calculates each combined value in the resulting
  58. large tensor by summing all values from all containing blocks.
  59. :class:`~torch.nn.Unfold` extracts the values in the local blocks by
  60. copying from the large tensor. So, if the blocks overlap, they are not
  61. inverses of each other.
  62. In general, folding and unfolding operations are related as
  63. follows. Consider :class:`~torch.nn.Fold` and
  64. :class:`~torch.nn.Unfold` instances created with the same
  65. parameters:
  66. >>> fold_params = dict(kernel_size=..., dilation=..., padding=..., stride=...)
  67. >>> fold = nn.Fold(output_size=..., **fold_params)
  68. >>> unfold = nn.Unfold(**fold_params)
  69. Then for any (supported) ``input`` tensor the following
  70. equality holds:
  71. ::
  72. fold(unfold(input)) == divisor * input
  73. where ``divisor`` is a tensor that depends only on the shape
  74. and dtype of the ``input``:
  75. >>> # xdoctest: +SKIP
  76. >>> input_ones = torch.ones(input.shape, dtype=input.dtype)
  77. >>> divisor = fold(unfold(input_ones))
  78. When the ``divisor`` tensor contains no zero elements, then
  79. ``fold`` and ``unfold`` operations are inverses of each
  80. other (up to constant divisor).
  81. .. warning::
  82. Currently, only unbatched (3D) or batched (4D) image-like output tensors are supported.
  83. Shape:
  84. - Input: :math:`(N, C \times \prod(\text{kernel\_size}), L)` or :math:`(C \times \prod(\text{kernel\_size}), L)`
  85. - Output: :math:`(N, C, \text{output\_size}[0], \text{output\_size}[1], \dots)`
  86. or :math:`(C, \text{output\_size}[0], \text{output\_size}[1], \dots)` as described above
  87. Examples::
  88. >>> fold = nn.Fold(output_size=(4, 5), kernel_size=(2, 2))
  89. >>> input = torch.randn(1, 3 * 2 * 2, 12)
  90. >>> output = fold(input)
  91. >>> output.size()
  92. torch.Size([1, 3, 4, 5])
  93. .. _link:
  94. https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
  95. """
  96. )
  97. __constants__ = ["output_size", "kernel_size", "dilation", "padding", "stride"]
  98. output_size: _size_any_t
  99. kernel_size: _size_any_t
  100. dilation: _size_any_t
  101. padding: _size_any_t
  102. stride: _size_any_t
  103. def __init__(
  104. self,
  105. output_size: _size_any_t,
  106. kernel_size: _size_any_t,
  107. dilation: _size_any_t = 1,
  108. padding: _size_any_t = 0,
  109. stride: _size_any_t = 1,
  110. ) -> None:
  111. super().__init__()
  112. self.output_size = output_size
  113. self.kernel_size = kernel_size
  114. self.dilation = dilation
  115. self.padding = padding
  116. self.stride = stride
  117. def forward(self, input: Tensor) -> Tensor:
  118. """
  119. Runs the forward pass.
  120. """
  121. return F.fold(
  122. input,
  123. self.output_size,
  124. self.kernel_size,
  125. self.dilation,
  126. self.padding,
  127. self.stride,
  128. )
  129. def extra_repr(self) -> str:
  130. """
  131. Return the extra representation of the module.
  132. """
  133. return (
  134. "output_size={output_size}, kernel_size={kernel_size}, "
  135. "dilation={dilation}, padding={padding}, stride={stride}".format(
  136. **self.__dict__
  137. )
  138. )
  139. class Unfold(Module):
  140. (
  141. r"""Extracts sliding local blocks from a batched input tensor.
  142. Consider a batched :attr:`input` tensor of shape :math:`(N, C, *)`,
  143. where :math:`N` is the batch dimension, :math:`C` is the channel dimension,
  144. and :math:`*` represent arbitrary spatial dimensions. This operation flattens
  145. each sliding :attr:`kernel_size`-sized block within the spatial dimensions
  146. of :attr:`input` into a column (i.e., last dimension) of a 3-D :attr:`output`
  147. tensor of shape :math:`(N, C \times \prod(\text{kernel\_size}), L)`, where
  148. :math:`C \times \prod(\text{kernel\_size})` is the total number of values
  149. within each block (a block has :math:`\prod(\text{kernel\_size})` spatial
  150. locations each containing a :math:`C`-channeled vector), and :math:`L` is
  151. the total number of such blocks:
  152. .. math::
  153. L = \prod_d \left\lfloor\frac{\text{spatial\_size}[d] + 2 \times \text{padding}[d] %
  154. - \text{dilation}[d] \times (\text{kernel\_size}[d] - 1) - 1}{\text{stride}[d]} + 1\right\rfloor,
  155. where :math:`\text{spatial\_size}` is formed by the spatial dimensions
  156. of :attr:`input` (:math:`*` above), and :math:`d` is over all spatial
  157. dimensions.
  158. Therefore, indexing :attr:`output` at the last dimension (column dimension)
  159. gives all values within a certain block.
  160. The :attr:`padding`, :attr:`stride` and :attr:`dilation` arguments specify
  161. how the sliding blocks are retrieved.
  162. * :attr:`stride` controls the stride for the sliding blocks.
  163. * :attr:`padding` controls the amount of implicit zero-paddings on both
  164. sides for :attr:`padding` number of points for each dimension before
  165. reshaping.
  166. """
  167. """
  168. * :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm.
  169. It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
  170. """
  171. r"""
  172. Args:
  173. kernel_size (int or tuple): the size of the sliding blocks
  174. dilation (int or tuple, optional): a parameter that controls the
  175. stride of elements within the
  176. neighborhood. Default: 1
  177. padding (int or tuple, optional): implicit zero padding to be added on
  178. both sides of input. Default: 0
  179. stride (int or tuple, optional): the stride of the sliding blocks in the input
  180. spatial dimensions. Default: 1
  181. * If :attr:`kernel_size`, :attr:`dilation`, :attr:`padding` or
  182. :attr:`stride` is an int or a tuple of length 1, their values will be
  183. replicated across all spatial dimensions.
  184. * For the case of two input spatial dimensions this operation is sometimes
  185. called ``im2col``.
  186. .. note::
  187. :class:`~torch.nn.Fold` calculates each combined value in the resulting
  188. large tensor by summing all values from all containing blocks.
  189. :class:`~torch.nn.Unfold` extracts the values in the local blocks by
  190. copying from the large tensor. So, if the blocks overlap, they are not
  191. inverses of each other.
  192. In general, folding and unfolding operations are related as
  193. follows. Consider :class:`~torch.nn.Fold` and
  194. :class:`~torch.nn.Unfold` instances created with the same
  195. parameters:
  196. >>> fold_params = dict(kernel_size=..., dilation=..., padding=..., stride=...)
  197. >>> fold = nn.Fold(output_size=..., **fold_params)
  198. >>> unfold = nn.Unfold(**fold_params)
  199. Then for any (supported) ``input`` tensor the following
  200. equality holds:
  201. ::
  202. fold(unfold(input)) == divisor * input
  203. where ``divisor`` is a tensor that depends only on the shape
  204. and dtype of the ``input``:
  205. >>> # xdoctest: +SKIP
  206. >>> input_ones = torch.ones(input.shape, dtype=input.dtype)
  207. >>> divisor = fold(unfold(input_ones))
  208. When the ``divisor`` tensor contains no zero elements, then
  209. ``fold`` and ``unfold`` operations are inverses of each
  210. other (up to constant divisor).
  211. .. warning::
  212. Currently, only 4-D input tensors (batched image-like tensors) are
  213. supported.
  214. Shape:
  215. - Input: :math:`(N, C, *)`
  216. - Output: :math:`(N, C \times \prod(\text{kernel\_size}), L)` as described above
  217. Examples::
  218. >>> unfold = nn.Unfold(kernel_size=(2, 3))
  219. >>> input = torch.randn(2, 5, 3, 4)
  220. >>> output = unfold(input)
  221. >>> # each patch contains 30 values (2x3=6 vectors, each of 5 channels)
  222. >>> # 4 blocks (2x3 kernels) in total in the 3x4 input
  223. >>> output.size()
  224. torch.Size([2, 30, 4])
  225. >>> # xdoctest: +IGNORE_WANT
  226. >>> # Convolution is equivalent with Unfold + Matrix Multiplication + Fold (or view to output shape)
  227. >>> inp = torch.randn(1, 3, 10, 12)
  228. >>> w = torch.randn(2, 3, 4, 5)
  229. >>> inp_unf = torch.nn.functional.unfold(inp, (4, 5))
  230. >>> out_unf = inp_unf.transpose(1, 2).matmul(w.view(w.size(0), -1).t()).transpose(1, 2)
  231. >>> out = torch.nn.functional.fold(out_unf, (7, 8), (1, 1))
  232. >>> # or equivalently (and avoiding a copy),
  233. >>> # out = out_unf.view(1, 2, 7, 8)
  234. >>> (torch.nn.functional.conv2d(inp, w) - out).abs().max()
  235. tensor(1.9073e-06)
  236. .. _link:
  237. https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
  238. """
  239. )
  240. __constants__ = ["kernel_size", "dilation", "padding", "stride"]
  241. kernel_size: _size_any_t
  242. dilation: _size_any_t
  243. padding: _size_any_t
  244. stride: _size_any_t
  245. def __init__(
  246. self,
  247. kernel_size: _size_any_t,
  248. dilation: _size_any_t = 1,
  249. padding: _size_any_t = 0,
  250. stride: _size_any_t = 1,
  251. ) -> None:
  252. super().__init__()
  253. self.kernel_size = kernel_size
  254. self.dilation = dilation
  255. self.padding = padding
  256. self.stride = stride
  257. def forward(self, input: Tensor) -> Tensor:
  258. """
  259. Runs the forward pass.
  260. """
  261. return F.unfold(
  262. input, self.kernel_size, self.dilation, self.padding, self.stride
  263. )
  264. def extra_repr(self) -> str:
  265. """
  266. Return the extra representation of the module.
  267. """
  268. return (
  269. "kernel_size={kernel_size}, dilation={dilation}, padding={padding},"
  270. " stride={stride}".format(**self.__dict__)
  271. )