lve.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
  1. # Copyright The Lightning team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from typing import List
  15. import torch
  16. from torch import Tensor
  17. def lip_vertex_error(
  18. vertices_pred: Tensor,
  19. vertices_gt: Tensor,
  20. mouth_map: List[int],
  21. validate_args: bool = True,
  22. ) -> Tensor:
  23. r"""Compute Lip Vertex Error (LVE) for 3D talking head evaluation.
  24. The Lip Vertex Error (LVE) metric evaluates the quality of lip synchronization in 3D facial animations by measuring
  25. the maximum Euclidean distance (L2 error) between corresponding lip vertices of the generated and ground truth
  26. meshes for each frame. The metric is defined as:
  27. .. math::
  28. \text{LVE} = \frac{1}{N} \sum_{i=1}^{N} \max_{v \in \text{lip}} \|x_{i,v} - \hat{x}_{i,v}\|_2^2
  29. where :math:`N` is the number of frames, :math:`x_{i,v}` represents the 3D coordinates of vertex :math:`v` in the
  30. lip region of the ground truth frame :math:`i`, and :math:`\hat{x}_{i,v}` represents the corresponding vertex in
  31. the predicted frame. The metric computes the maximum squared L2 distance between corresponding lip vertices for each
  32. frame and averages across all frames. A lower LVE value indicates better lip synchronization quality.
  33. Args:
  34. vertices_pred: Predicted vertices tensor of shape (T, V, 3) where T is number of frames,
  35. V is number of vertices, and 3 represents XYZ coordinates
  36. vertices_gt: Ground truth vertices tensor of shape (T', V, 3) where T' can be different from T
  37. mouth_map: List of vertex indices corresponding to the mouth region
  38. validate_args: bool indicating if input arguments and tensors should be validated for correctness.
  39. Set to ``False`` for faster computations.
  40. Returns:
  41. torch.Tensor: Scalar tensor containing the mean LVE value across all frames
  42. Raises:
  43. ValueError:
  44. If the number of dimensions of `vertices_pred` or `vertices_gt` is not 3.
  45. If vertex dimensions (V) or coordinate dimensions (3) don't match
  46. If ``mouth_map`` is empty or contains invalid indices
  47. Example:
  48. >>> import torch
  49. >>> from torchmetrics.functional.multimodal import lip_vertex_error
  50. >>> vertices_pred = torch.randn(10, 100, 3, generator=torch.manual_seed(42))
  51. >>> vertices_gt = torch.randn(10, 100, 3, generator=torch.manual_seed(43))
  52. >>> mouth_map = [0, 1, 2, 3, 4]
  53. >>> lip_vertex_error(vertices_pred, vertices_gt, mouth_map)
  54. tensor(12.7688)
  55. """
  56. if validate_args:
  57. if vertices_pred.ndim != 3 or vertices_gt.ndim != 3:
  58. raise ValueError(
  59. f"Expected both vertices_pred and vertices_gt to have 3 dimensions but got "
  60. f"{vertices_pred.ndim} and {vertices_gt.ndim} dimensions respectively."
  61. )
  62. if vertices_pred.shape[1:] != vertices_gt.shape[1:]:
  63. raise ValueError(
  64. f"Expected vertices_pred and vertices_gt to have same vertex and coordinate dimensions but got "
  65. f"shapes {vertices_pred.shape} and {vertices_gt.shape}."
  66. )
  67. if not mouth_map:
  68. raise ValueError("mouth_map cannot be empty.")
  69. if max(mouth_map) >= vertices_pred.shape[1]:
  70. raise ValueError(
  71. f"mouth_map contains invalid vertex indices. Max index {max(mouth_map)} is larger than "
  72. f"number of vertices {vertices_pred.shape[1]}."
  73. )
  74. min_frames = min(vertices_pred.shape[0], vertices_gt.shape[0])
  75. vertices_pred = vertices_pred[:min_frames]
  76. vertices_gt = vertices_gt[:min_frames]
  77. diff = vertices_gt[:, mouth_map, :] - vertices_pred[:, mouth_map, :] # Shape: (T, M, 3)
  78. sq_dist = torch.sum(diff**2, dim=-1) # Shape: (T, M)
  79. max_per_frame = torch.max(sq_dist, dim=1).values # Shape: (T,)
  80. return torch.mean(max_per_frame)