audio_utils.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295
  1. # Copyright 2023 The HuggingFace Team. All rights reserved.
  2. import datetime
  3. import platform
  4. import subprocess
  5. import numpy as np
  6. def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
  7. """
  8. Helper function to read an audio file through ffmpeg.
  9. """
  10. ar = f"{sampling_rate}"
  11. ac = "1"
  12. format_for_conversion = "f32le"
  13. ffmpeg_command = [
  14. "ffmpeg",
  15. "-i",
  16. "pipe:0",
  17. "-ac",
  18. ac,
  19. "-ar",
  20. ar,
  21. "-f",
  22. format_for_conversion,
  23. "-hide_banner",
  24. "-loglevel",
  25. "quiet",
  26. "pipe:1",
  27. ]
  28. try:
  29. with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process:
  30. output_stream = ffmpeg_process.communicate(bpayload)
  31. except FileNotFoundError as error:
  32. raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error
  33. out_bytes = output_stream[0]
  34. audio = np.frombuffer(out_bytes, np.float32)
  35. if audio.shape[0] == 0:
  36. raise ValueError(
  37. "Soundfile is either not in the correct format or is malformed. Ensure that the soundfile has "
  38. "a valid audio file extension (e.g. wav, flac or mp3) and is not corrupted. If reading from a remote "
  39. "URL, ensure that the URL is the full address to **download** the audio file."
  40. )
  41. return audio
  42. def ffmpeg_microphone(
  43. sampling_rate: int,
  44. chunk_length_s: float,
  45. format_for_conversion: str = "f32le",
  46. ffmpeg_input_device: str | None = None,
  47. ffmpeg_additional_args: list[str] | None = None,
  48. ):
  49. """
  50. Helper function to read audio from a microphone using ffmpeg. The default input device will be used unless another
  51. input device is specified using the `ffmpeg_input_device` argument. Uses 'alsa' on Linux, 'avfoundation' on MacOS and
  52. 'dshow' on Windows.
  53. Arguments:
  54. sampling_rate (`int`):
  55. The sampling_rate to use when reading the data from the microphone. Try using the model's sampling_rate to
  56. avoid resampling later.
  57. chunk_length_s (`float` or `int`):
  58. The length of the maximum chunk of audio to be sent returned.
  59. format_for_conversion (`str`, defaults to `f32le`):
  60. The name of the format of the audio samples to be returned by ffmpeg. The standard is `f32le`, `s16le`
  61. could also be used.
  62. ffmpeg_input_device (`str`, *optional*):
  63. The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
  64. the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
  65. for how to specify and list input devices.
  66. ffmpeg_additional_args (`list[str]`, *optional*):
  67. Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background
  68. process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags
  69. with multiple arguments, use the following convention (eg ["flag", "arg1", "arg2]).
  70. Returns:
  71. A generator yielding audio chunks of `chunk_length_s` seconds as `bytes` objects of length
  72. `int(round(sampling_rate * chunk_length_s)) * size_of_sample`.
  73. """
  74. ar = f"{sampling_rate}"
  75. ac = "1"
  76. if format_for_conversion == "s16le":
  77. size_of_sample = 2
  78. elif format_for_conversion == "f32le":
  79. size_of_sample = 4
  80. else:
  81. raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")
  82. system = platform.system()
  83. if system == "Linux":
  84. format_ = "alsa"
  85. input_ = ffmpeg_input_device or "default"
  86. elif system == "Darwin":
  87. format_ = "avfoundation"
  88. input_ = ffmpeg_input_device or ":default"
  89. elif system == "Windows":
  90. format_ = "dshow"
  91. input_ = ffmpeg_input_device or _get_microphone_name()
  92. ffmpeg_additional_args = [] if ffmpeg_additional_args is None else ffmpeg_additional_args
  93. ffmpeg_command = [
  94. "ffmpeg",
  95. "-f",
  96. format_,
  97. "-i",
  98. input_,
  99. "-ac",
  100. ac,
  101. "-ar",
  102. ar,
  103. "-f",
  104. format_for_conversion,
  105. "-fflags",
  106. "nobuffer",
  107. "-hide_banner",
  108. "-loglevel",
  109. "quiet",
  110. "pipe:1",
  111. ]
  112. ffmpeg_command.extend(ffmpeg_additional_args)
  113. chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
  114. iterator = _ffmpeg_stream(ffmpeg_command, chunk_len)
  115. yield from iterator
  116. def ffmpeg_microphone_live(
  117. sampling_rate: int,
  118. chunk_length_s: float,
  119. stream_chunk_s: int | None = None,
  120. stride_length_s: tuple[float, float] | float | None = None,
  121. format_for_conversion: str = "f32le",
  122. ffmpeg_input_device: str | None = None,
  123. ffmpeg_additional_args: list[str] | None = None,
  124. ):
  125. """
  126. Helper function to read audio from a microphone using ffmpeg. This will output `partial` overlapping chunks starting
  127. from `stream_chunk_s` (if it is defined) until `chunk_length_s` is reached. It will make use of striding to avoid
  128. errors on the "sides" of the various chunks. The default input device will be used unless another input device is
  129. specified using the `ffmpeg_input_device` argument. Uses 'alsa' on Linux, 'avfoundation' on MacOS and 'dshow' on Windows.
  130. Arguments:
  131. sampling_rate (`int`):
  132. The sampling_rate to use when reading the data from the microphone. Try using the model's sampling_rate to
  133. avoid resampling later.
  134. chunk_length_s (`float` or `int`):
  135. The length of the maximum chunk of audio to be sent returned. This includes the eventual striding.
  136. stream_chunk_s (`float` or `int`):
  137. The length of the minimal temporary audio to be returned.
  138. stride_length_s (`float` or `int` or `(float, float)`, *optional*):
  139. The length of the striding to be used. Stride is used to provide context to a model on the (left, right) of
  140. an audio sample but without using that part to actually make the prediction. Setting this does not change
  141. the length of the chunk.
  142. format_for_conversion (`str`, *optional*, defaults to `f32le`):
  143. The name of the format of the audio samples to be returned by ffmpeg. The standard is `f32le`, `s16le`
  144. could also be used.
  145. ffmpeg_input_device (`str`, *optional*):
  146. The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
  147. the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
  148. for how to specify and list input devices.
  149. ffmpeg_additional_args (`list[str]`, *optional*):
  150. Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background
  151. process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags
  152. with multiple arguments, use the following convention (eg ["flag", "arg1", "arg2]).
  153. Return:
  154. A generator yielding dictionaries of the following form
  155. `{"sampling_rate": int, "raw": np.ndarray, "partial" bool}` With optionally a `"stride" (int, int)` key if
  156. `stride_length_s` is defined.
  157. `stride` and `raw` are all expressed in `samples`, and `partial` is a boolean saying if the current yield item
  158. is a whole chunk, or a partial temporary result to be later replaced by another larger chunk.
  159. """
  160. if stream_chunk_s is not None:
  161. chunk_s = stream_chunk_s
  162. else:
  163. chunk_s = chunk_length_s
  164. microphone = ffmpeg_microphone(
  165. sampling_rate,
  166. chunk_s,
  167. format_for_conversion=format_for_conversion,
  168. ffmpeg_input_device=ffmpeg_input_device,
  169. ffmpeg_additional_args=[] if ffmpeg_additional_args is None else ffmpeg_additional_args,
  170. )
  171. if format_for_conversion == "s16le":
  172. dtype = np.int16
  173. size_of_sample = 2
  174. elif format_for_conversion == "f32le":
  175. dtype = np.float32
  176. size_of_sample = 4
  177. else:
  178. raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")
  179. if stride_length_s is None:
  180. stride_length_s = chunk_length_s / 6
  181. chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
  182. if isinstance(stride_length_s, (int, float)):
  183. stride_length_s = [stride_length_s, stride_length_s]
  184. stride_left = int(round(sampling_rate * stride_length_s[0])) * size_of_sample
  185. stride_right = int(round(sampling_rate * stride_length_s[1])) * size_of_sample
  186. audio_time = datetime.datetime.now()
  187. delta = datetime.timedelta(seconds=chunk_s)
  188. for item in chunk_bytes_iter(microphone, chunk_len, stride=(stride_left, stride_right), stream=True):
  189. # Put everything back in numpy scale
  190. item["raw"] = np.frombuffer(item["raw"], dtype=dtype)
  191. item["stride"] = (
  192. item["stride"][0] // size_of_sample,
  193. item["stride"][1] // size_of_sample,
  194. )
  195. item["sampling_rate"] = sampling_rate
  196. audio_time += delta
  197. if datetime.datetime.now() > audio_time + 10 * delta:
  198. # We're late !! SKIP
  199. continue
  200. yield item
  201. def chunk_bytes_iter(iterator, chunk_len: int, stride: tuple[int, int], stream: bool = False):
  202. """
  203. Reads raw bytes from an iterator and does chunks of length `chunk_len`. Optionally adds `stride` to each chunks to
  204. get overlaps. `stream` is used to return partial results even if a full `chunk_len` is not yet available.
  205. """
  206. acc = b""
  207. stride_left, stride_right = stride
  208. if stride_left + stride_right >= chunk_len:
  209. raise ValueError(
  210. f"Stride needs to be strictly smaller than chunk_len: ({stride_left}, {stride_right}) vs {chunk_len}"
  211. )
  212. _stride_left = 0
  213. for raw in iterator:
  214. acc += raw
  215. if stream and len(acc) < chunk_len:
  216. stride = (_stride_left, 0)
  217. yield {"raw": acc[:chunk_len], "stride": stride, "partial": True}
  218. else:
  219. while len(acc) >= chunk_len:
  220. # We are flushing the accumulator
  221. stride = (_stride_left, stride_right)
  222. item = {"raw": acc[:chunk_len], "stride": stride}
  223. if stream:
  224. item["partial"] = False
  225. yield item
  226. _stride_left = stride_left
  227. acc = acc[chunk_len - stride_left - stride_right :]
  228. # Last chunk
  229. if len(acc) > stride_left:
  230. item = {"raw": acc, "stride": (_stride_left, 0)}
  231. if stream:
  232. item["partial"] = False
  233. yield item
  234. def _ffmpeg_stream(ffmpeg_command, buflen: int):
  235. """
  236. Internal function to create the generator of data through ffmpeg
  237. """
  238. bufsize = 2**24 # 16Mo
  239. try:
  240. with subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, bufsize=bufsize) as ffmpeg_process:
  241. while True:
  242. raw = ffmpeg_process.stdout.read(buflen)
  243. if raw == b"":
  244. break
  245. yield raw
  246. except FileNotFoundError as error:
  247. raise ValueError("ffmpeg was not found but is required to stream audio files from filename") from error
  248. def _get_microphone_name():
  249. """
  250. Retrieve the microphone name in Windows .
  251. """
  252. command = ["ffmpeg", "-list_devices", "true", "-f", "dshow", "-i", ""]
  253. try:
  254. ffmpeg_devices = subprocess.run(command, text=True, stderr=subprocess.PIPE, encoding="utf-8")
  255. microphone_lines = [line for line in ffmpeg_devices.stderr.splitlines() if "(audio)" in line]
  256. if microphone_lines:
  257. microphone_name = microphone_lines[0].split('"')[1]
  258. print(f"Using microphone: {microphone_name}")
  259. return f"audio={microphone_name}"
  260. except FileNotFoundError:
  261. print("ffmpeg was not found. Please install it or make sure it is in your system PATH.")
  262. return "default"