automatic_speech_recognition.py 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656
  1. # Copyright 2021 The HuggingFace Team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from collections import defaultdict
  15. from typing import TYPE_CHECKING, Any, Union
  16. import httpx
  17. import numpy as np
  18. from ..generation import GenerationConfig
  19. from ..tokenization_python import PreTrainedTokenizer
  20. from ..utils import is_torch_available, is_torchaudio_available, is_torchcodec_available, logging
  21. from .audio_utils import ffmpeg_read
  22. from .base import ChunkPipeline
  23. if TYPE_CHECKING:
  24. from pyctcdecode import BeamSearchDecoderCTC
  25. from ..feature_extraction_sequence_utils import SequenceFeatureExtractor
  26. from ..modeling_utils import PreTrainedModel
  27. logger = logging.get_logger(__name__)
  28. if is_torch_available():
  29. import torch
  30. from ..models.auto.modeling_auto import MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES
  31. def rescale_stride(stride, ratio):
  32. """
  33. Rescales the stride values from audio space to tokens/logits space.
  34. (160_000, 16_000, 16_000) -> (2000, 200, 200) for instance.
  35. """
  36. # Shape is [B, SEQ] for tokens
  37. # [B, SEQ, V] for logits
  38. new_strides = []
  39. for input_n, left, right in stride:
  40. token_n = int(round(input_n * ratio))
  41. left = int(round(left / input_n * token_n))
  42. right = int(round(right / input_n * token_n))
  43. new_stride = (token_n, left, right)
  44. new_strides.append(new_stride)
  45. return new_strides
  46. def chunk_iter(inputs, feature_extractor, chunk_len, stride_left, stride_right, dtype=None):
  47. inputs_len = inputs.shape[0]
  48. step = chunk_len - stride_left - stride_right
  49. for chunk_start_idx in range(0, inputs_len, step):
  50. chunk_end_idx = chunk_start_idx + chunk_len
  51. chunk = inputs[chunk_start_idx:chunk_end_idx]
  52. processed = feature_extractor(
  53. chunk,
  54. sampling_rate=feature_extractor.sampling_rate,
  55. return_tensors="pt",
  56. return_attention_mask=True,
  57. )
  58. if dtype is not None:
  59. processed = processed.to(dtype=dtype)
  60. _stride_left = 0 if chunk_start_idx == 0 else stride_left
  61. is_last = chunk_end_idx >= inputs_len
  62. _stride_right = 0 if is_last else stride_right
  63. chunk_len = chunk.shape[0]
  64. stride = (chunk_len, _stride_left, _stride_right)
  65. if chunk.shape[0] > _stride_left:
  66. yield {"is_last": is_last, "stride": stride, **processed}
  67. if is_last:
  68. break
  69. def _find_longest_common_sequence(sequences, tokenizer):
  70. # TODO Use a faster algorithm this can probably be done in O(n)
  71. # using suffix array.
  72. # It might be tedious to do because of fault tolerance.
  73. # We actually have a really good property which is that the total sequence
  74. # MUST be those subsequences in order.
  75. # Also the algorithm should be more tolerant to errors.
  76. sequence = [tok_id for tok_id in sequences[0][0].tolist() if tok_id not in tokenizer.all_special_ids]
  77. for new_seq in sequences[1:]:
  78. new_sequence = [tok_id for tok_id in new_seq[0].tolist() if tok_id not in tokenizer.all_special_ids]
  79. index = 0
  80. max_ = 0.0
  81. for i in range(1, len(new_sequence) + 1):
  82. # epsilon to favor long perfect matches
  83. eps = i / 10000.0
  84. matches = np.sum(np.array(sequence[-i:]) == np.array(new_sequence[:i]))
  85. matching = matches / i + eps
  86. if matches > 1 and matching > max_:
  87. index = i
  88. max_ = matching
  89. sequence.extend(new_sequence[index:])
  90. return np.array(sequence)
  91. class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
  92. """
  93. Pipeline that aims at extracting spoken text contained within some audio.
  94. The input can be either a raw waveform or a audio file. In case of the audio file, ffmpeg should be installed for
  95. to support multiple audio formats
  96. Unless the model you're using explicitly sets these generation parameters in its configuration files
  97. (`generation_config.json`), the following default values will be used:
  98. - max_new_tokens: 256
  99. - num_beams: 5
  100. Example:
  101. ```python
  102. >>> from transformers import pipeline
  103. >>> transcriber = pipeline(model="openai/whisper-base")
  104. >>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")
  105. {'text': ' He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered flour-fatten sauce.'}
  106. ```
  107. Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
  108. Arguments:
  109. model ([`PreTrainedModel`]):
  110. The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
  111. [`PreTrainedModel`].
  112. feature_extractor ([`SequenceFeatureExtractor`], *optional*):
  113. The feature extractor that will be used by the pipeline to encode waveform for the model.
  114. tokenizer ([`PreTrainedTokenizer`], *optional*):
  115. The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
  116. [`PreTrainedTokenizer`].
  117. decoder (`pyctcdecode.BeamSearchDecoderCTC`, *optional*):
  118. [PyCTCDecode's
  119. BeamSearchDecoderCTC](https://github.com/kensho-technologies/pyctcdecode/blob/2fd33dc37c4111417e08d89ccd23d28e9b308d19/pyctcdecode/decoder.py#L180)
  120. can be passed for language model boosted decoding. See [`Wav2Vec2ProcessorWithLM`] for more information.
  121. device (Union[`int`, `torch.device`], *optional*):
  122. Device ordinal for CPU/GPU supports. Setting this to `None` will leverage CPU, a positive will run the
  123. model on the associated CUDA device id.
  124. """
  125. _pipeline_calls_generate = True
  126. _load_processor = False
  127. _load_image_processor = False
  128. _load_feature_extractor = True
  129. _load_tokenizer = True
  130. # Make sure the docstring is updated when the default generation config is changed
  131. _default_generation_config = GenerationConfig(
  132. max_new_tokens=256,
  133. num_beams=5, # follows openai's whisper implementation
  134. )
  135. def __init__(
  136. self,
  137. model: "PreTrainedModel",
  138. feature_extractor: Union["SequenceFeatureExtractor", str] | None = None,
  139. tokenizer: PreTrainedTokenizer | None = None,
  140. decoder: Union["BeamSearchDecoderCTC", str] | None = None,
  141. device: Union[int, "torch.device"] | None = None,
  142. **kwargs,
  143. ):
  144. # set the model type so we can check we have the right pre- and post-processing parameters
  145. if model.config.model_type == "whisper":
  146. self.type = "seq2seq_whisper"
  147. elif model.__class__.__name__ in MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES.values():
  148. self.type = "seq2seq"
  149. elif decoder is not None:
  150. self.decoder = decoder
  151. self.type = "ctc_with_lm"
  152. else:
  153. self.type = "ctc"
  154. super().__init__(model, tokenizer, feature_extractor, device=device, **kwargs)
  155. def __call__(self, inputs: np.ndarray | bytes | str | dict, **kwargs: Any) -> list[dict[str, Any]]:
  156. """
  157. Transcribe the audio sequence(s) given as inputs to text. See the [`AutomaticSpeechRecognitionPipeline`]
  158. documentation for more information.
  159. Args:
  160. inputs (`np.ndarray` or `bytes` or `str` or `dict`):
  161. The inputs is either :
  162. - `str` that is either the filename of a local audio file, or a public URL address to download the
  163. audio file. The file will be read at the correct sampling rate to get the waveform using
  164. *ffmpeg*. This requires *ffmpeg* to be installed on the system.
  165. - `bytes` it is supposed to be the content of an audio file and is interpreted by *ffmpeg* in the
  166. same way.
  167. - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`)
  168. Raw audio at the correct sampling rate (no further check will be done)
  169. - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this
  170. pipeline do the resampling. The dict must be in the format `{"sampling_rate": int, "raw":
  171. np.array}` with optionally a `"stride": (left: int, right: int)` than can ask the pipeline to
  172. treat the first `left` samples and last `right` samples to be ignored in decoding (but used at
  173. inference to provide more context to the model). Only use `stride` with CTC models.
  174. return_timestamps (*optional*, `str` or `bool`):
  175. Only available for pure CTC models (Wav2Vec2, HuBERT, etc) and the Whisper model. Not available for
  176. other sequence-to-sequence models.
  177. For CTC models, timestamps can take one of two formats:
  178. - `"char"`: the pipeline will return timestamps along the text for every character in the text. For
  179. instance, if you get `[{"text": "h", "timestamp": (0.5, 0.6)}, {"text": "i", "timestamp": (0.7,
  180. 0.9)}]`, then it means the model predicts that the letter "h" was spoken after `0.5` and before
  181. `0.6` seconds.
  182. - `"word"`: the pipeline will return timestamps along the text for every word in the text. For
  183. instance, if you get `[{"text": "hi ", "timestamp": (0.5, 0.9)}, {"text": "there", "timestamp":
  184. (1.0, 1.5)}]`, then it means the model predicts that the word "hi" was spoken after `0.5` and
  185. before `0.9` seconds.
  186. For the Whisper model, timestamps can take one of two formats:
  187. - `"word"`: same as above for word-level CTC timestamps. Word-level timestamps are predicted
  188. through the *dynamic-time warping (DTW)* algorithm, an approximation to word-level timestamps
  189. by inspecting the cross-attention weights.
  190. - `True`: the pipeline will return timestamps along the text for *segments* of words in the text.
  191. For instance, if you get `[{"text": " Hi there!", "timestamp": (0.5, 1.5)}]`, then it means the
  192. model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds.
  193. Note that a segment of text refers to a sequence of one or more words, rather than individual
  194. words as with word-level timestamps.
  195. generate_kwargs (`dict`, *optional*):
  196. The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a
  197. complete overview of generate, check the [following
  198. guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation).
  199. Return:
  200. `Dict`: A dictionary with the following keys:
  201. - **text** (`str`): The recognized text.
  202. - **chunks** (*optional(, `list[Dict]`)
  203. When using `return_timestamps`, the `chunks` will become a list containing all the various text
  204. chunks identified by the model, *e.g.* `[{"text": "hi ", "timestamp": (0.5, 0.9)}, {"text":
  205. "there", "timestamp": (1.0, 1.5)}]`. The original full text can roughly be recovered by doing
  206. `"".join(chunk["text"] for chunk in output["chunks"])`.
  207. """
  208. return super().__call__(inputs, **kwargs)
  209. def _sanitize_parameters(
  210. self,
  211. chunk_length_s=None,
  212. stride_length_s=None,
  213. ignore_warning=None,
  214. decoder_kwargs=None,
  215. return_timestamps=None,
  216. return_language=None,
  217. **generate_kwargs,
  218. ):
  219. preprocess_params = {}
  220. forward_params = {}
  221. postprocess_params = {}
  222. # Preprocess params
  223. if chunk_length_s is not None:
  224. if self.type in ["seq2seq", "seq2seq_whisper"] and not ignore_warning:
  225. type_warning = (
  226. "Using `chunk_length_s` is very experimental with seq2seq models. The results will not necessarily"
  227. " be entirely accurate and will have caveats. More information:"
  228. " https://github.com/huggingface/transformers/pull/20104. Ignore this warning with pipeline(...,"
  229. " ignore_warning=True)."
  230. )
  231. if self.type == "seq2seq_whisper":
  232. type_warning += (
  233. " To use Whisper for long-form transcription, use rather the model's `generate` method directly "
  234. "as the model relies on it's own chunking mechanism (cf. Whisper original paper, section 3.8. "
  235. "Long-form Transcription)."
  236. )
  237. logger.warning(type_warning)
  238. preprocess_params["chunk_length_s"] = chunk_length_s
  239. if stride_length_s is not None:
  240. preprocess_params["stride_length_s"] = stride_length_s
  241. # Forward params
  242. # BC: accept a dictionary of generation kwargs (as opposed to **generate_kwargs)
  243. if "generate_kwargs" in generate_kwargs:
  244. forward_params.update(generate_kwargs.pop("generate_kwargs"))
  245. # Default use for kwargs: they are generation-time kwargs
  246. forward_params.update(generate_kwargs)
  247. if getattr(self, "assistant_model", None) is not None:
  248. forward_params["assistant_model"] = self.assistant_model
  249. if getattr(self, "assistant_tokenizer", None) is not None:
  250. forward_params["tokenizer"] = self.tokenizer
  251. forward_params["assistant_tokenizer"] = self.assistant_tokenizer
  252. # Postprocess params
  253. if decoder_kwargs is not None:
  254. postprocess_params["decoder_kwargs"] = decoder_kwargs
  255. if return_language is not None:
  256. if self.type != "seq2seq_whisper":
  257. raise ValueError("Only Whisper can return language for now.")
  258. postprocess_params["return_language"] = return_language
  259. # Parameter used in more than one place
  260. # in some models like whisper, the generation config has a `return_timestamps` key
  261. if hasattr(self, "generation_config") and hasattr(self.generation_config, "return_timestamps"):
  262. return_timestamps = return_timestamps or self.generation_config.return_timestamps
  263. if return_timestamps is not None:
  264. # Check whether we have a valid setting for return_timestamps and throw an error before we perform a forward pass
  265. if self.type == "seq2seq" and return_timestamps:
  266. raise ValueError("We cannot return_timestamps yet on non-CTC models apart from Whisper!")
  267. if self.type == "ctc_with_lm" and return_timestamps != "word":
  268. raise ValueError("CTC with LM can only predict word level timestamps, set `return_timestamps='word'`")
  269. if self.type == "ctc" and return_timestamps not in ["char", "word"]:
  270. raise ValueError(
  271. "CTC can either predict character level timestamps, or word level timestamps. "
  272. "Set `return_timestamps='char'` or `return_timestamps='word'` as required."
  273. )
  274. if self.type == "seq2seq_whisper" and return_timestamps == "char":
  275. raise ValueError(
  276. "Whisper cannot return `char` timestamps, only word level or segment level timestamps. "
  277. "Use `return_timestamps='word'` or `return_timestamps=True` respectively."
  278. )
  279. forward_params["return_timestamps"] = return_timestamps
  280. postprocess_params["return_timestamps"] = return_timestamps
  281. return preprocess_params, forward_params, postprocess_params
  282. @property
  283. def _align_to(self):
  284. """Sample stride per output."""
  285. # XXX: Carefully, this variable will not exist in `seq2seq` setting.
  286. # Currently chunking is not possible at this level for `seq2seq` so
  287. # it's ok.
  288. align_to = getattr(self.model.config, "inputs_to_logits_ratio", 1)
  289. if self.model.config.model_type == "lasr_ctc":
  290. # TODO: find a standard for that but not easy because input length -> mel length depends on the feature extractor
  291. # specific way of doing it
  292. # means the model take mel features as input, we align according to the hop length
  293. align_to *= self.feature_extractor.hop_length
  294. return align_to
  295. def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
  296. if isinstance(inputs, str):
  297. if inputs.startswith("http://") or inputs.startswith("https://"):
  298. # We need to actually check for a real protocol, otherwise it's impossible to use a local file
  299. # like http_huggingface_co.png
  300. inputs = httpx.get(inputs, follow_redirects=True).content
  301. else:
  302. with open(inputs, "rb") as f:
  303. inputs = f.read()
  304. if isinstance(inputs, bytes):
  305. inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)
  306. stride = None
  307. extra = {}
  308. if is_torch_available():
  309. import torch
  310. if isinstance(inputs, torch.Tensor):
  311. inputs = inputs.cpu().numpy()
  312. if is_torchcodec_available():
  313. import torchcodec
  314. if isinstance(inputs, torchcodec.decoders.AudioDecoder):
  315. _audio_samples = inputs.get_all_samples()
  316. # torchcodec always returns (num_channels, num_samples)
  317. # while before (datasets < 4.0) we had (2, num_samples) if stereo, (num_samples,) if mono
  318. _array = _audio_samples.data
  319. _array = _array[0] if _array.ndim == 2 and _array.shape[0] == 1 else _array
  320. inputs = {"array": _array, "sampling_rate": _audio_samples.sample_rate}
  321. if isinstance(inputs, dict):
  322. stride = inputs.pop("stride", None)
  323. # Accepting `"array"` which is the key defined in `datasets` for
  324. # better integration
  325. if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)):
  326. raise ValueError(
  327. "When passing a dictionary to AutomaticSpeechRecognitionPipeline, the dict needs to contain a "
  328. '"raw" key containing the numpy array or torch tensor representing the audio and a "sampling_rate" key, '
  329. "containing the sampling_rate associated with that array"
  330. )
  331. _inputs = inputs.pop("raw", None)
  332. if _inputs is None:
  333. # Remove path which will not be used from `datasets`.
  334. inputs.pop("path", None)
  335. _inputs = inputs.pop("array", None)
  336. in_sampling_rate = inputs.pop("sampling_rate")
  337. extra = inputs
  338. inputs = _inputs
  339. if in_sampling_rate != self.feature_extractor.sampling_rate:
  340. if is_torchaudio_available():
  341. from torchaudio import functional as F
  342. else:
  343. raise ImportError(
  344. "torchaudio is required to resample audio samples in AutomaticSpeechRecognitionPipeline. "
  345. "The torchaudio package can be installed through: `pip install torchaudio`."
  346. )
  347. inputs = F.resample(
  348. torch.from_numpy(inputs) if isinstance(inputs, np.ndarray) else inputs,
  349. in_sampling_rate,
  350. self.feature_extractor.sampling_rate,
  351. ).numpy()
  352. ratio = self.feature_extractor.sampling_rate / in_sampling_rate
  353. else:
  354. ratio = 1
  355. if stride is not None:
  356. if stride[0] + stride[1] > inputs.shape[0]:
  357. raise ValueError("Stride is too large for input")
  358. # Stride needs to get the chunk length here, it's going to get
  359. # swallowed by the `feature_extractor` later, and then batching
  360. # can add extra data in the inputs, so we need to keep track
  361. # of the original length in the stride so we can cut properly.
  362. stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio)))
  363. if not isinstance(inputs, (np.ndarray, torch.Tensor)):
  364. raise TypeError(f"We expect a numpy ndarray or torch tensor as input, got `{type(inputs)}`")
  365. if inputs.ndim != 1:
  366. logger.warning(
  367. f"We expect a single channel audio input for AutomaticSpeechRecognitionPipeline, got {inputs.ndim}. Taking the mean of the channels for mono conversion."
  368. )
  369. inputs = inputs.mean(axis=0)
  370. if chunk_length_s:
  371. if stride_length_s is None:
  372. stride_length_s = chunk_length_s / 6
  373. if isinstance(stride_length_s, (int, float)):
  374. stride_length_s = [stride_length_s, stride_length_s]
  375. align_to = self._align_to
  376. chunk_len = int(round(chunk_length_s * self.feature_extractor.sampling_rate / align_to) * align_to)
  377. stride_left = int(round(stride_length_s[0] * self.feature_extractor.sampling_rate / align_to) * align_to)
  378. stride_right = int(round(stride_length_s[1] * self.feature_extractor.sampling_rate / align_to) * align_to)
  379. if chunk_len < stride_left + stride_right:
  380. raise ValueError("Chunk length must be superior to stride length")
  381. for item in chunk_iter(inputs, self.feature_extractor, chunk_len, stride_left, stride_right, self.dtype):
  382. yield {**item, **extra}
  383. else:
  384. if self.type == "seq2seq_whisper" and inputs.shape[0] > self.feature_extractor.n_samples:
  385. processed = self.feature_extractor(
  386. inputs,
  387. sampling_rate=self.feature_extractor.sampling_rate,
  388. truncation=False,
  389. padding="longest",
  390. return_tensors="pt",
  391. return_attention_mask=True,
  392. )
  393. else:
  394. if self.type == "seq2seq_whisper" and stride is None:
  395. processed = self.feature_extractor(
  396. inputs,
  397. sampling_rate=self.feature_extractor.sampling_rate,
  398. return_tensors="pt",
  399. return_attention_mask=True,
  400. )
  401. else:
  402. processed = self.feature_extractor(
  403. inputs,
  404. sampling_rate=self.feature_extractor.sampling_rate,
  405. return_tensors="pt",
  406. return_attention_mask=True,
  407. )
  408. if self.dtype is not None:
  409. processed = processed.to(dtype=self.dtype)
  410. if stride is not None:
  411. if self.type == "seq2seq":
  412. raise ValueError("Stride is only usable with CTC models, try removing it !")
  413. processed["stride"] = stride
  414. yield {"is_last": True, **processed, **extra}
  415. def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
  416. attention_mask = model_inputs.pop("attention_mask", None)
  417. stride = model_inputs.pop("stride", None)
  418. num_frames = model_inputs.pop("num_frames", None)
  419. is_last = model_inputs.pop("is_last")
  420. if stride is not None and num_frames is not None:
  421. raise ValueError("num_frames must be used only when stride is None")
  422. if self.type in {"seq2seq", "seq2seq_whisper"}:
  423. # Consume values so we can let extra information flow freely through
  424. # the pipeline (important for `partial` in microphone)
  425. if "input_features" in model_inputs:
  426. inputs = model_inputs.pop("input_features")
  427. elif "input_values" in model_inputs:
  428. inputs = model_inputs.pop("input_values")
  429. else:
  430. raise ValueError(
  431. "Seq2Seq speech recognition model requires either a "
  432. f"`input_features` or `input_values` key, but only has {model_inputs.keys()}"
  433. )
  434. # custom processing for Whisper timestamps and word-level timestamps
  435. return_timestamps = return_timestamps or getattr(self.generation_config, "return_timestamps", False)
  436. if return_timestamps and self.type == "seq2seq_whisper":
  437. generate_kwargs["return_timestamps"] = bool(return_timestamps)
  438. if return_timestamps == "word":
  439. generate_kwargs["return_token_timestamps"] = True
  440. generate_kwargs["return_segments"] = True
  441. # User-defined `generation_config` passed to the pipeline call take precedence
  442. if "generation_config" not in generate_kwargs:
  443. generate_kwargs["generation_config"] = self.generation_config
  444. main_input_name = self.model.main_input_name if hasattr(self.model, "main_input_name") else "inputs"
  445. generate_kwargs = {
  446. main_input_name: inputs,
  447. "attention_mask": attention_mask,
  448. **generate_kwargs,
  449. }
  450. tokens = self.model.generate(**generate_kwargs)
  451. # whisper longform generation stores timestamps in "segments"
  452. if return_timestamps == "word" and self.type == "seq2seq_whisper":
  453. if "segments" not in tokens:
  454. out = {"tokens": tokens["sequences"], "token_timestamps": tokens["token_timestamps"]}
  455. else:
  456. token_timestamps = [
  457. torch.cat([segment["token_timestamps"] for segment in segment_list])
  458. for segment_list in tokens["segments"]
  459. ]
  460. out = {"tokens": tokens["sequences"], "token_timestamps": token_timestamps}
  461. else:
  462. out = {"tokens": tokens}
  463. if self.type == "seq2seq_whisper":
  464. if stride is not None:
  465. out["stride"] = stride
  466. else:
  467. inputs = {
  468. self.model.main_input_name: model_inputs.pop(self.model.main_input_name),
  469. "attention_mask": attention_mask,
  470. }
  471. outputs = self.model(**inputs)
  472. logits = outputs.logits
  473. if self.type == "ctc_with_lm":
  474. out = {"logits": logits}
  475. else:
  476. out = {"tokens": logits.argmax(dim=-1)}
  477. if stride is not None:
  478. # Send stride to `postprocess`.
  479. # it needs to be handled there where
  480. # the pieces are to be concatenated.
  481. ratio = 1 / self._align_to
  482. if isinstance(stride, tuple):
  483. out["stride"] = rescale_stride([stride], ratio)[0]
  484. else:
  485. out["stride"] = rescale_stride(stride, ratio)
  486. # Leftover
  487. extra = model_inputs
  488. return {"is_last": is_last, **out, **extra}
  489. def postprocess(
  490. self, model_outputs, decoder_kwargs: dict | None = None, return_timestamps=None, return_language=None
  491. ):
  492. # Optional return types
  493. optional = {}
  494. final_items = []
  495. key = "logits" if self.type == "ctc_with_lm" else "tokens"
  496. stride = None
  497. for outputs in model_outputs:
  498. if outputs[key].dtype in (torch.bfloat16, torch.float16):
  499. items = outputs[key].to(torch.float32).numpy()
  500. else:
  501. items = outputs[key].numpy()
  502. stride = outputs.get("stride", None)
  503. if stride is not None and self.type in {"ctc", "ctc_with_lm"}:
  504. total_n, left, right = stride
  505. # Total_n might be < logits.shape[1]
  506. # because of padding, that's why
  507. # we need to reconstruct this information
  508. # This won't work with left padding (which doesn't exist right now)
  509. right_n = total_n - right
  510. items = items[:, left:right_n]
  511. final_items.append(items)
  512. if stride and self.type == "seq2seq":
  513. items = _find_longest_common_sequence(final_items, self.tokenizer)
  514. elif self.type == "seq2seq_whisper":
  515. time_precision = self.feature_extractor.chunk_length / self.model.config.max_source_positions
  516. # Send the chunking back to seconds, it's easier to handle in whisper
  517. sampling_rate = self.feature_extractor.sampling_rate
  518. for output in model_outputs:
  519. if "stride" in output:
  520. chunk_len, stride_left, stride_right = output["stride"]
  521. # Go back in seconds
  522. chunk_len /= sampling_rate
  523. stride_left /= sampling_rate
  524. stride_right /= sampling_rate
  525. output["stride"] = chunk_len, stride_left, stride_right
  526. text, optional = self.tokenizer._decode_asr(
  527. model_outputs,
  528. return_timestamps=return_timestamps,
  529. return_language=return_language,
  530. time_precision=time_precision,
  531. )
  532. else:
  533. items = np.concatenate(final_items, axis=1)
  534. items = items.squeeze(0)
  535. if self.type == "ctc_with_lm":
  536. if decoder_kwargs is None:
  537. decoder_kwargs = {}
  538. beams = self.decoder.decode_beams(items, **decoder_kwargs)
  539. text = beams[0][0]
  540. if return_timestamps:
  541. # Simply cast from pyctcdecode format to wav2vec2 format to leverage
  542. # pre-existing code later
  543. chunk_offset = beams[0][2]
  544. offsets = []
  545. for word, (start_offset, end_offset) in chunk_offset:
  546. offsets.append({"word": word, "start_offset": start_offset, "end_offset": end_offset})
  547. elif self.type != "seq2seq_whisper":
  548. skip_special_tokens = self.type != "ctc"
  549. text = self.tokenizer.decode(items, skip_special_tokens=skip_special_tokens)
  550. if return_timestamps:
  551. offsets = self.tokenizer.decode(
  552. items, skip_special_tokens=skip_special_tokens, output_char_offsets=True
  553. )["char_offsets"]
  554. if return_timestamps == "word":
  555. offsets = self.tokenizer._get_word_offsets(offsets, self.tokenizer.replace_word_delimiter_char)
  556. if return_timestamps and self.type not in {"seq2seq", "seq2seq_whisper"}:
  557. chunks = []
  558. align_to = self._align_to
  559. for item in offsets:
  560. start = item["start_offset"] * align_to
  561. start /= self.feature_extractor.sampling_rate
  562. stop = item["end_offset"] * align_to
  563. stop /= self.feature_extractor.sampling_rate
  564. chunks.append({"text": item[return_timestamps], "timestamp": (start, stop)})
  565. optional["chunks"] = chunks
  566. extra = defaultdict(list)
  567. for output in model_outputs:
  568. output.pop("tokens", None)
  569. output.pop("logits", None)
  570. output.pop("is_last", None)
  571. output.pop("stride", None)
  572. output.pop("token_timestamps", None)
  573. for k, v in output.items():
  574. extra[k].append(v)
  575. return {"text": text, **optional, **extra}