| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237 |
- # Copyright 2023 The HuggingFace Inc. team and the librosa & torchaudio authors.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """
- Audio processing functions to extract features from audio waveforms. This code is pure numpy to support all frameworks
- and remove unnecessary dependencies.
- """
- import base64
- import importlib
- import io
- import os
- import warnings
- from collections.abc import Sequence
- from io import BytesIO
- from typing import TYPE_CHECKING, Any, Union
- import httpx
- import numpy as np
- from packaging import version
- from .utils import (
- is_librosa_available,
- is_numpy_array,
- is_soundfile_available,
- is_torch_tensor,
- is_torchcodec_available,
- requires_backends,
- )
- if TYPE_CHECKING:
- import torch
- if is_soundfile_available():
- import soundfile as sf
- if is_librosa_available():
- import librosa
- # TODO: @eustlb, we actually don't need librosa but soxr is installed with librosa
- import soxr
- if is_torchcodec_available():
- TORCHCODEC_VERSION = version.parse(importlib.metadata.version("torchcodec"))
- AudioInput = Union[np.ndarray, "torch.Tensor", Sequence[np.ndarray], Sequence["torch.Tensor"]]
- def load_audio(audio: str | np.ndarray, sampling_rate=16000, timeout=None) -> np.ndarray:
- """
- Loads `audio` to an np.ndarray object.
- Args:
- audio (`str` or `np.ndarray`):
- The audio to be loaded to the numpy array format.
- sampling_rate (`int`, *optional*, defaults to 16000):
- The sampling rate to be used when loading the audio. It should be same as the
- sampling rate the model you will be using further was trained with.
- timeout (`float`, *optional*):
- The timeout value in seconds for the URL request.
- Returns:
- `np.ndarray`: A numpy array representing the audio.
- """
- if isinstance(audio, str):
- # Try to load with `torchcodec` but do not enforce users to install it. If not found
- # fallback to `librosa`. If using an audio-only model, most probably `torchcodec` won't be
- # needed. Do not raise any errors if not installed or versions do not match
- if is_torchcodec_available() and version.parse("0.3.0") <= TORCHCODEC_VERSION:
- audio = load_audio_torchcodec(audio, sampling_rate=sampling_rate)
- else:
- audio = load_audio_librosa(audio, sampling_rate=sampling_rate, timeout=timeout)
- elif not isinstance(audio, np.ndarray):
- raise TypeError(
- "Incorrect format used for `audio`. Should be an url linking to an audio, a local path, or numpy array."
- )
- return audio
- def load_audio_torchcodec(audio: str | np.ndarray, sampling_rate=16000) -> np.ndarray:
- """
- Loads `audio` to an np.ndarray object using `torchcodec`.
- Args:
- audio (`str` or `np.ndarray`):
- The audio to be loaded to the numpy array format.
- sampling_rate (`int`, *optional*, defaults to 16000):
- The sampling rate to be used when loading the audio. It should be same as the
- sampling rate the model you will be using further was trained with.
- Returns:
- `np.ndarray`: A numpy array representing the audio.
- """
- # Lazy import so that issues in torchcodec compatibility don't crash the whole library
- requires_backends(load_audio_torchcodec, ["torchcodec"])
- from torchcodec.decoders import AudioDecoder
- # Set `num_channels` to `1` which is what most models expects and the default in librosa
- decoder = AudioDecoder(audio, sample_rate=sampling_rate, num_channels=1)
- audio = decoder.get_all_samples().data[0].numpy() # NOTE: feature extractors don't accept torch tensors
- return audio
- def load_audio_librosa(audio: str | np.ndarray, sampling_rate=16000, timeout=None) -> np.ndarray:
- """
- Loads `audio` to an np.ndarray object using `librosa`.
- Args:
- audio (`str` or `np.ndarray`):
- The audio to be loaded to the numpy array format.
- sampling_rate (`int`, *optional*, defaults to 16000):
- The sampling rate to be used when loading the audio. It should be same as the
- sampling rate the model you will be using further was trained with.
- timeout (`float`, *optional*):
- The timeout value in seconds for the URL request.
- Returns:
- `np.ndarray`: A numpy array representing the audio.
- """
- requires_backends(load_audio_librosa, ["librosa"])
- # Load audio from URL (e.g https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav)
- if audio.startswith("http://") or audio.startswith("https://"):
- audio = librosa.load(
- BytesIO(httpx.get(audio, follow_redirects=True, timeout=timeout).content), sr=sampling_rate
- )[0]
- elif os.path.isfile(audio):
- audio = librosa.load(audio, sr=sampling_rate)[0]
- return audio
- def load_audio_as(
- audio: str,
- return_format: str,
- timeout: int | None = None,
- force_mono: bool = False,
- sampling_rate: int | None = None,
- ) -> str | dict[str, Any] | io.BytesIO | None:
- """
- Load audio from either a local file path or URL and return in specified format.
- Args:
- audio (`str`): Either a local file path or a URL to an audio file
- return_format (`str`): Format to return the audio in:
- - "base64": Base64 encoded string
- - "dict": Dictionary with data and format
- - "buffer": BytesIO object
- timeout (`int`, *optional*): Timeout for URL requests in seconds
- force_mono (`bool`): Whether to convert stereo audio to mono
- sampling_rate (`int`, *optional*): If provided, the audio will be resampled to the specified sampling rate.
- Returns:
- `Union[str, Dict[str, Any], io.BytesIO, None]`:
- - `str`: Base64 encoded audio data (if return_format="base64")
- - `dict`: Dictionary with 'data' (base64 encoded audio data) and 'format' keys (if return_format="dict")
- - `io.BytesIO`: BytesIO object containing audio data (if return_format="buffer")
- """
- requires_backends(load_audio_as, ["librosa"])
- if return_format not in ["base64", "dict", "buffer"]:
- raise ValueError(f"Invalid return_format: {return_format}. Must be 'base64', 'dict', or 'buffer'")
- try:
- # Load audio bytes from URL or file
- audio_bytes = None
- if audio.startswith(("http://", "https://")):
- response = httpx.get(audio, follow_redirects=True, timeout=timeout)
- response.raise_for_status()
- audio_bytes = response.content
- elif os.path.isfile(audio):
- with open(audio, "rb") as audio_file:
- audio_bytes = audio_file.read()
- else:
- raise ValueError(f"File not found: {audio}")
- # Process audio data
- with io.BytesIO(audio_bytes) as audio_file:
- with sf.SoundFile(audio_file) as f:
- audio_array = f.read(dtype="float32")
- original_sr = f.samplerate
- audio_format = f.format
- if sampling_rate is not None and sampling_rate != original_sr:
- # Resample audio to target sampling rate
- audio_array = soxr.resample(audio_array, original_sr, sampling_rate, quality="HQ")
- else:
- sampling_rate = original_sr
- # Convert to mono if needed
- if force_mono and audio_array.ndim != 1:
- audio_array = audio_array.mean(axis=1)
- buffer = io.BytesIO()
- sf.write(buffer, audio_array, sampling_rate, format=audio_format.upper())
- buffer.seek(0)
- if return_format == "buffer":
- return buffer
- elif return_format == "base64":
- return base64.b64encode(buffer.read()).decode("utf-8")
- elif return_format == "dict":
- return {
- "data": base64.b64encode(buffer.read()).decode("utf-8"),
- "format": audio_format.lower(),
- }
- except Exception as e:
- raise ValueError(f"Error loading audio: {e}")
- def conv1d_output_length(module: "torch.nn.Conv1d", input_length: int) -> int:
- """
- Computes the output length of a 1D convolution layer according to torch's documentation:
- https://docs.pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
- """
- return int(
- (input_length + 2 * module.padding[0] - module.dilation[0] * (module.kernel_size[0] - 1) - 1)
- / module.stride[0]
- + 1
- )
- def is_valid_audio(audio):
- return is_numpy_array(audio) or is_torch_tensor(audio)
- def is_valid_list_of_audio(audio):
- return audio and all(is_valid_audio(audio_i) for audio_i in audio)
- def make_list_of_audio(
- audio: list[AudioInput] | AudioInput,
- ) -> AudioInput:
- """
- Ensure that the output is a list of audio.
- Args:
- audio (`Union[list[AudioInput], AudioInput]`):
- The input audio.
- Returns:
- list: A list of audio.
- """
- # If it's a list of audios, it's already in the right format
- if isinstance(audio, (list, tuple)) and is_valid_list_of_audio(audio):
- return audio
- # If it's a single audio, convert it to a list of
- if is_valid_audio(audio):
- return [audio]
- raise ValueError("Invalid input type. Must be a single audio or a list of audio")
- def hertz_to_mel(freq: float | np.ndarray, mel_scale: str = "htk") -> float | np.ndarray:
- """
- Convert frequency from hertz to mels.
- Args:
- freq (`float` or `np.ndarray`):
- The frequency, or multiple frequencies, in hertz (Hz).
- mel_scale (`str`, *optional*, defaults to `"htk"`):
- The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`.
- Returns:
- `float` or `np.ndarray`: The frequencies on the mel scale.
- """
- if mel_scale not in ["slaney", "htk", "kaldi"]:
- raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".')
- if mel_scale == "htk":
- return 2595.0 * np.log10(1.0 + (freq / 700.0))
- elif mel_scale == "kaldi":
- return 1127.0 * np.log(1.0 + (freq / 700.0))
- min_log_hertz = 1000.0
- min_log_mel = 15.0
- logstep = 27.0 / np.log(6.4)
- mels = 3.0 * freq / 200.0
- if isinstance(freq, np.ndarray):
- log_region = freq >= min_log_hertz
- mels[log_region] = min_log_mel + np.log(freq[log_region] / min_log_hertz) * logstep
- elif freq >= min_log_hertz:
- mels = min_log_mel + np.log(freq / min_log_hertz) * logstep
- return mels
- def mel_to_hertz(mels: float | np.ndarray, mel_scale: str = "htk") -> float | np.ndarray:
- """
- Convert frequency from mels to hertz.
- Args:
- mels (`float` or `np.ndarray`):
- The frequency, or multiple frequencies, in mels.
- mel_scale (`str`, *optional*, `"htk"`):
- The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`.
- Returns:
- `float` or `np.ndarray`: The frequencies in hertz.
- """
- if mel_scale not in ["slaney", "htk", "kaldi"]:
- raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".')
- if mel_scale == "htk":
- return 700.0 * (np.power(10, mels / 2595.0) - 1.0)
- elif mel_scale == "kaldi":
- return 700.0 * (np.exp(mels / 1127.0) - 1.0)
- min_log_hertz = 1000.0
- min_log_mel = 15.0
- logstep = np.log(6.4) / 27.0
- freq = 200.0 * mels / 3.0
- if isinstance(mels, np.ndarray):
- log_region = mels >= min_log_mel
- freq[log_region] = min_log_hertz * np.exp(logstep * (mels[log_region] - min_log_mel))
- elif mels >= min_log_mel:
- freq = min_log_hertz * np.exp(logstep * (mels - min_log_mel))
- return freq
- def hertz_to_octave(freq: float | np.ndarray, tuning: float = 0.0, bins_per_octave: int = 12):
- """
- Convert frequency from hertz to fractional octave numbers.
- Adapted from *librosa*.
- Args:
- freq (`float` or `np.ndarray`):
- The frequency, or multiple frequencies, in hertz (Hz).
- tuning (`float`, defaults to `0.`):
- Tuning deviation from the Stuttgart pitch (A440) in (fractional) bins per octave.
- bins_per_octave (`int`, defaults to `12`):
- Number of bins per octave.
- Returns:
- `float` or `np.ndarray`: The frequencies on the octave scale.
- """
- stuttgart_pitch = 440.0 * 2.0 ** (tuning / bins_per_octave)
- octave = np.log2(freq / (float(stuttgart_pitch) / 16))
- return octave
- def _create_triangular_filter_bank(fft_freqs: np.ndarray, filter_freqs: np.ndarray) -> np.ndarray:
- """
- Creates a triangular filter bank.
- Adapted from *torchaudio* and *librosa*.
- Args:
- fft_freqs (`np.ndarray` of shape `(num_frequency_bins,)`):
- Discrete frequencies of the FFT bins in Hz.
- filter_freqs (`np.ndarray` of shape `(num_mel_filters,)`):
- Center frequencies of the triangular filters to create, in Hz.
- Returns:
- `np.ndarray` of shape `(num_frequency_bins, num_mel_filters)`
- """
- filter_diff = np.diff(filter_freqs)
- slopes = np.expand_dims(filter_freqs, 0) - np.expand_dims(fft_freqs, 1)
- down_slopes = -slopes[:, :-2] / filter_diff[:-1]
- up_slopes = slopes[:, 2:] / filter_diff[1:]
- return np.maximum(np.zeros(1), np.minimum(down_slopes, up_slopes))
- def chroma_filter_bank(
- num_frequency_bins: int,
- num_chroma: int,
- sampling_rate: int,
- tuning: float = 0.0,
- power: float | None = 2.0,
- weighting_parameters: tuple[float, float] | None = (5.0, 2.0),
- start_at_c_chroma: bool = True,
- ):
- """
- Creates a chroma filter bank, i.e a linear transformation to project spectrogram bins onto chroma bins.
- Adapted from *librosa*.
- Args:
- num_frequency_bins (`int`):
- Number of frequencies used to compute the spectrogram (should be the same as in `stft`).
- num_chroma (`int`):
- Number of chroma bins (i.e pitch classes).
- sampling_rate (`float`):
- Sample rate of the audio waveform.
- tuning (`float`):
- Tuning deviation from A440 in fractions of a chroma bin.
- power (`float`, *optional*, defaults to 2.0):
- If 12.0, normalizes each column with their L2 norm. If 1.0, normalizes each column with their L1 norm.
- weighting_parameters (`tuple[float, float]`, *optional*, defaults to `(5., 2.)`):
- If specified, apply a Gaussian weighting parameterized by the first element of the tuple being the center and
- the second element being the Gaussian half-width.
- start_at_c_chroma (`bool`, *optional*, defaults to `True`):
- If True, the filter bank will start at the 'C' pitch class. Otherwise, it will start at 'A'.
- Returns:
- `np.ndarray` of shape `(num_frequency_bins, num_chroma)`
- """
- # Get the FFT bins, not counting the DC component
- frequencies = np.linspace(0, sampling_rate, num_frequency_bins, endpoint=False)[1:]
- freq_bins = num_chroma * hertz_to_octave(frequencies, tuning=tuning, bins_per_octave=num_chroma)
- # make up a value for the 0 Hz bin = 1.5 octaves below bin 1
- # (so chroma is 50% rotated from bin 1, and bin width is broad)
- freq_bins = np.concatenate(([freq_bins[0] - 1.5 * num_chroma], freq_bins))
- bins_width = np.concatenate((np.maximum(freq_bins[1:] - freq_bins[:-1], 1.0), [1]))
- chroma_filters = np.subtract.outer(freq_bins, np.arange(0, num_chroma, dtype="d")).T
- num_chroma2 = np.round(float(num_chroma) / 2)
- # Project into range -num_chroma/2 .. num_chroma/2
- # add on fixed offset of 10*num_chroma to ensure all values passed to
- # rem are positive
- chroma_filters = np.remainder(chroma_filters + num_chroma2 + 10 * num_chroma, num_chroma) - num_chroma2
- # Gaussian bumps - 2*D to make them narrower
- chroma_filters = np.exp(-0.5 * (2 * chroma_filters / np.tile(bins_width, (num_chroma, 1))) ** 2)
- # normalize each column
- if power is not None:
- chroma_filters = chroma_filters / np.sum(chroma_filters**power, axis=0, keepdims=True) ** (1.0 / power)
- # Maybe apply scaling for fft bins
- if weighting_parameters is not None:
- center, half_width = weighting_parameters
- chroma_filters *= np.tile(
- np.exp(-0.5 * (((freq_bins / num_chroma - center) / half_width) ** 2)),
- (num_chroma, 1),
- )
- if start_at_c_chroma:
- chroma_filters = np.roll(chroma_filters, -3 * (num_chroma // 12), axis=0)
- # remove aliasing columns, copy to ensure row-contiguity
- return np.ascontiguousarray(chroma_filters[:, : int(1 + num_frequency_bins / 2)])
- def mel_filter_bank(
- num_frequency_bins: int,
- num_mel_filters: int,
- min_frequency: float,
- max_frequency: float,
- sampling_rate: int,
- norm: str | None = None,
- mel_scale: str = "htk",
- triangularize_in_mel_space: bool = False,
- ) -> np.ndarray:
- """
- Creates a frequency bin conversion matrix used to obtain a mel spectrogram. This is called a *mel filter bank*, and
- various implementation exist, which differ in the number of filters, the shape of the filters, the way the filters
- are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these
- features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency.
- Different banks of mel filters were introduced in the literature. The following variations are supported:
- - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHz and a speech
- bandwidth of `[0, 4600]` Hz.
- - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a speech
- bandwidth of `[0, 8000]` Hz. This assumes sampling rate ≥ 16 kHz.
- - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate of 16 kHz and
- speech bandwidth of `[133, 6854]` Hz. This version also includes area normalization.
- - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes a sampling rate of
- 12.5 kHz and speech bandwidth of `[0, 6250]` Hz.
- This code is adapted from *torchaudio* and *librosa*. Note that the default parameters of torchaudio's
- `melscale_fbanks` implement the `"htk"` filters while librosa uses the `"slaney"` implementation.
- Args:
- num_frequency_bins (`int`):
- Number of frequency bins (should be the same as `n_fft // 2 + 1` where `n_fft` is the size of the Fourier Transform used to compute the spectrogram).
- num_mel_filters (`int`):
- Number of mel filters to generate.
- min_frequency (`float`):
- Lowest frequency of interest in Hz.
- max_frequency (`float`):
- Highest frequency of interest in Hz. This should not exceed `sampling_rate / 2`.
- sampling_rate (`int`):
- Sample rate of the audio waveform.
- norm (`str`, *optional*):
- If `"slaney"`, divide the triangular mel weights by the width of the mel band (area normalization).
- mel_scale (`str`, *optional*, defaults to `"htk"`):
- The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`.
- triangularize_in_mel_space (`bool`, *optional*, defaults to `False`):
- If this option is enabled, the triangular filter is applied in mel space rather than frequency space. This
- should be set to `true` in order to get the same results as `torchaudio` when computing mel filters.
- Returns:
- `np.ndarray` of shape (`num_frequency_bins`, `num_mel_filters`): Triangular filter bank matrix. This is a
- projection matrix to go from a spectrogram to a mel spectrogram.
- """
- if norm is not None and norm != "slaney":
- raise ValueError('norm must be one of None or "slaney"')
- if num_frequency_bins < 2:
- raise ValueError(f"Require num_frequency_bins: {num_frequency_bins} >= 2")
- if min_frequency > max_frequency:
- raise ValueError(f"Require min_frequency: {min_frequency} <= max_frequency: {max_frequency}")
- # center points of the triangular mel filters
- mel_min = hertz_to_mel(min_frequency, mel_scale=mel_scale)
- mel_max = hertz_to_mel(max_frequency, mel_scale=mel_scale)
- mel_freqs = np.linspace(mel_min, mel_max, num_mel_filters + 2)
- filter_freqs = mel_to_hertz(mel_freqs, mel_scale=mel_scale)
- if triangularize_in_mel_space:
- # frequencies of FFT bins in Hz, but filters triangularized in mel space
- fft_bin_width = sampling_rate / ((num_frequency_bins - 1) * 2)
- fft_freqs = hertz_to_mel(fft_bin_width * np.arange(num_frequency_bins), mel_scale=mel_scale)
- filter_freqs = mel_freqs
- else:
- # frequencies of FFT bins in Hz
- fft_freqs = np.linspace(0, sampling_rate // 2, num_frequency_bins)
- mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs)
- if norm is not None and norm == "slaney":
- # Slaney-style mel is scaled to be approx constant energy per channel
- enorm = 2.0 / (filter_freqs[2 : num_mel_filters + 2] - filter_freqs[:num_mel_filters])
- mel_filters *= np.expand_dims(enorm, 0)
- if (mel_filters.max(axis=0) == 0.0).any():
- warnings.warn(
- "At least one mel filter has all zero values. "
- f"The value for `num_mel_filters` ({num_mel_filters}) may be set too high. "
- f"Or, the value for `num_frequency_bins` ({num_frequency_bins}) may be set too low."
- )
- return mel_filters
- def optimal_fft_length(window_length: int) -> int:
- """
- Finds the best FFT input size for a given `window_length`. This function takes a given window length and, if not
- already a power of two, rounds it up to the next power or two.
- The FFT algorithm works fastest when the length of the input is a power of two, which may be larger than the size
- of the window or analysis frame. For example, if the window is 400 samples, using an FFT input size of 512 samples
- is more optimal than an FFT size of 400 samples. Using a larger FFT size does not affect the detected frequencies,
- it simply gives a higher frequency resolution (i.e. the frequency bins are smaller).
- """
- return 2 ** int(np.ceil(np.log2(window_length)))
- def window_function(
- window_length: int,
- name: str = "hann",
- periodic: bool = True,
- frame_length: int | None = None,
- center: bool = True,
- ) -> np.ndarray:
- """
- Returns an array containing the specified window. This window is intended to be used with `stft`.
- The following window types are supported:
- - `"boxcar"`: a rectangular window
- - `"hamming"`: the Hamming window
- - `"hann"`: the Hann window
- - `"povey"`: the Povey window
- Args:
- window_length (`int`):
- The length of the window in samples.
- name (`str`, *optional*, defaults to `"hann"`):
- The name of the window function.
- periodic (`bool`, *optional*, defaults to `True`):
- Whether the window is periodic or symmetric.
- frame_length (`int`, *optional*):
- The length of the analysis frames in samples. Provide a value for `frame_length` if the window is smaller
- than the frame length, so that it will be zero-padded.
- center (`bool`, *optional*, defaults to `True`):
- Whether to center the window inside the FFT buffer. Only used when `frame_length` is provided.
- Returns:
- `np.ndarray` of shape `(window_length,)` or `(frame_length,)` containing the window.
- """
- length = window_length + 1 if periodic else window_length
- if name == "boxcar":
- window = np.ones(length)
- elif name in ["hamming", "hamming_window"]:
- window = np.hamming(length)
- elif name in ["hann", "hann_window"]:
- window = np.hanning(length)
- elif name == "povey":
- window = np.power(np.hanning(length), 0.85)
- else:
- raise ValueError(f"Unknown window function '{name}'")
- if periodic:
- window = window[:-1]
- if frame_length is None:
- return window
- if window_length > frame_length:
- raise ValueError(
- f"Length of the window ({window_length}) may not be larger than frame_length ({frame_length})"
- )
- padded_window = np.zeros(frame_length)
- offset = (frame_length - window_length) // 2 if center else 0
- padded_window[offset : offset + window_length] = window
- return padded_window
- # Note: This method processes a single waveform. For batch processing, use spectrogram_batch().
- def spectrogram(
- waveform: np.ndarray,
- window: np.ndarray,
- frame_length: int,
- hop_length: int,
- fft_length: int | None = None,
- power: float | None = 1.0,
- center: bool = True,
- pad_mode: str = "reflect",
- onesided: bool = True,
- dither: float = 0.0,
- preemphasis: float | None = None,
- mel_filters: np.ndarray | None = None,
- mel_floor: float = 1e-10,
- log_mel: str | None = None,
- reference: float = 1.0,
- min_value: float = 1e-10,
- db_range: float | None = None,
- remove_dc_offset: bool = False,
- dtype: np.dtype = np.float32,
- ) -> np.ndarray:
- """
- Calculates a spectrogram over one waveform using the Short-Time Fourier Transform.
- This function can create the following kinds of spectrograms:
- - amplitude spectrogram (`power = 1.0`)
- - power spectrogram (`power = 2.0`)
- - complex-valued spectrogram (`power = None`)
- - log spectrogram (use `log_mel` argument)
- - mel spectrogram (provide `mel_filters`)
- - log-mel spectrogram (provide `mel_filters` and `log_mel`)
- How this works:
- 1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length
- - hop_length` samples.
- 2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`.
- 3. The DFT is taken of each windowed frame.
- 4. The results are stacked into a spectrogram.
- We make a distinction between the following "blocks" of sample data, each of which may have a different lengths:
- - The analysis frame. This is the size of the time slices that the input waveform is split into.
- - The window. Each analysis frame is multiplied by the window to avoid spectral leakage.
- - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram.
- In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A
- padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame,
- typically the next power of two.
- Note: This function is not optimized for speed yet. It should be mostly compatible with `librosa.stft` and
- `torchaudio.functional.transforms.Spectrogram`, although it is more flexible due to the different ways spectrograms
- can be constructed.
- Args:
- waveform (`np.ndarray` of shape `(length,)`):
- The input waveform. This must be a single real-valued, mono waveform.
- window (`np.ndarray` of shape `(frame_length,)`):
- The windowing function to apply, including zero-padding if necessary. The actual window length may be
- shorter than `frame_length`, but we're assuming the array has already been zero-padded.
- frame_length (`int`):
- The length of the analysis frames in samples. With librosa this is always equal to `fft_length` but we also
- allow smaller sizes.
- hop_length (`int`):
- The stride between successive analysis frames in samples.
- fft_length (`int`, *optional*):
- The size of the FFT buffer in samples. This determines how many frequency bins the spectrogram will have.
- For optimal speed, this should be a power of two. If `None`, uses `frame_length`.
- power (`float`, *optional*, defaults to 1.0):
- If 1.0, returns the amplitude spectrogram. If 2.0, returns the power spectrogram. If `None`, returns
- complex numbers.
- center (`bool`, *optional*, defaults to `True`):
- Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame
- `t` will start at time `t * hop_length`.
- pad_mode (`str`, *optional*, defaults to `"reflect"`):
- Padding mode used when `center` is `True`. Possible values are: `"constant"` (pad with zeros), `"edge"`
- (pad with edge values), `"reflect"` (pads with mirrored values).
- onesided (`bool`, *optional*, defaults to `True`):
- If True, only computes the positive frequencies and returns a spectrogram containing `fft_length // 2 + 1`
- frequency bins. If False, also computes the negative frequencies and returns `fft_length` frequency bins.
- dither (`float`, *optional*, defaults to 0.0):
- Adds dithering. In other words, adds a small Gaussian noise to each frame.
- E.g. use 4.0 to add dithering with a normal distribution centered
- around 0.0 with standard deviation 4.0, 0.0 means no dithering.
- Dithering has similar effect as `mel_floor`. It reduces the high log_mel_fbank
- values for signals with hard-zero sections, when VAD cutoff is present in the signal.
- preemphasis (`float`, *optional*)
- Coefficient for a low-pass filter that applies pre-emphasis before the DFT.
- mel_filters (`np.ndarray` of shape `(num_freq_bins, num_mel_filters)`, *optional*):
- The mel filter bank. If supplied, applies a this filter bank to create a mel spectrogram.
- mel_floor (`float`, *optional*, defaults to 1e-10):
- Minimum value of mel frequency banks.
- log_mel (`str`, *optional*):
- How to convert the spectrogram to log scale. Possible options are: `None` (don't convert), `"log"` (take
- the natural logarithm) `"log10"` (take the base-10 logarithm), `"dB"` (convert to decibels). Can only be
- used when `power` is not `None`.
- reference (`float`, *optional*, defaults to 1.0):
- Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
- the loudest part to 0 dB. Must be greater than zero.
- min_value (`float`, *optional*, defaults to `1e-10`):
- The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
- `log(0)`. For a power spectrogram, the default of `1e-10` corresponds to a minimum of -100 dB. For an
- amplitude spectrogram, the value `1e-5` corresponds to -100 dB. Must be greater than zero.
- db_range (`float`, *optional*):
- Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
- peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
- remove_dc_offset (`bool`, *optional*):
- Subtract mean from waveform on each frame, applied before pre-emphasis. This should be set to `true` in
- order to get the same results as `torchaudio.compliance.kaldi.fbank` when computing mel filters.
- dtype (`np.dtype`, *optional*, defaults to `np.float32`):
- Data type of the spectrogram tensor. If `power` is None, this argument is ignored and the dtype will be
- `np.complex64`.
- Returns:
- `nd.array` containing a spectrogram of shape `(num_frequency_bins, length)` for a regular spectrogram or shape
- `(num_mel_filters, length)` for a mel spectrogram.
- """
- window_length = len(window)
- if fft_length is None:
- fft_length = frame_length
- if frame_length > fft_length:
- raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})")
- if window_length != frame_length:
- raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})")
- if hop_length <= 0:
- raise ValueError("hop_length must be greater than zero")
- if waveform.ndim != 1:
- raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}")
- if np.iscomplexobj(waveform):
- raise ValueError("Complex-valued input waveforms are not currently supported")
- if power is None and mel_filters is not None:
- raise ValueError(
- "You have provided `mel_filters` but `power` is `None`. Mel spectrogram computation is not yet supported for complex-valued spectrogram."
- "Specify `power` to fix this issue."
- )
- # center pad the waveform
- if center:
- padding = [(int(frame_length // 2), int(frame_length // 2))]
- waveform = np.pad(waveform, padding, mode=pad_mode)
- # promote to float64, since np.fft uses float64 internally
- waveform = waveform.astype(np.float64)
- window = window.astype(np.float64)
- # split waveform into frames of frame_length size
- num_frames = int(1 + np.floor((waveform.size - frame_length) / hop_length))
- num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length
- spectrogram = np.empty((num_frames, num_frequency_bins), dtype=np.complex64)
- # rfft is faster than fft
- fft_func = np.fft.rfft if onesided else np.fft.fft
- buffer = np.zeros(fft_length)
- timestep = 0
- for frame_idx in range(num_frames):
- buffer[:frame_length] = waveform[timestep : timestep + frame_length]
- if dither != 0.0:
- buffer[:frame_length] += dither * np.random.randn(frame_length)
- if remove_dc_offset:
- buffer[:frame_length] = buffer[:frame_length] - buffer[:frame_length].mean()
- if preemphasis is not None:
- buffer[1:frame_length] -= preemphasis * buffer[: frame_length - 1]
- buffer[0] *= 1 - preemphasis
- buffer[:frame_length] *= window
- spectrogram[frame_idx] = fft_func(buffer)
- timestep += hop_length
- # note: ** is much faster than np.power
- if power is not None:
- spectrogram = np.abs(spectrogram, dtype=np.float64) ** power
- spectrogram = spectrogram.T
- if mel_filters is not None:
- spectrogram = np.maximum(mel_floor, np.dot(mel_filters.T, spectrogram))
- if power is not None and log_mel is not None:
- if log_mel == "log":
- spectrogram = np.log(spectrogram)
- elif log_mel == "log10":
- spectrogram = np.log10(spectrogram)
- elif log_mel == "dB":
- if power == 1.0:
- spectrogram = amplitude_to_db(spectrogram, reference, min_value, db_range)
- elif power == 2.0:
- spectrogram = power_to_db(spectrogram, reference, min_value, db_range)
- else:
- raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}")
- else:
- raise ValueError(f"Unknown log_mel option: {log_mel}")
- spectrogram = np.asarray(spectrogram, dtype)
- return spectrogram
- def spectrogram_batch(
- waveform_list: list[np.ndarray],
- window: np.ndarray,
- frame_length: int,
- hop_length: int,
- fft_length: int | None = None,
- power: float | None = 1.0,
- center: bool = True,
- pad_mode: str = "reflect",
- onesided: bool = True,
- dither: float = 0.0,
- preemphasis: float | None = None,
- mel_filters: np.ndarray | None = None,
- mel_floor: float = 1e-10,
- log_mel: str | None = None,
- reference: float = 1.0,
- min_value: float = 1e-10,
- db_range: float | None = None,
- remove_dc_offset: bool = False,
- dtype: np.dtype = np.float32,
- ) -> list[np.ndarray]:
- """
- Calculates spectrograms for a list of waveforms using the Short-Time Fourier Transform, optimized for batch processing.
- This function extends the capabilities of the `spectrogram` function to handle multiple waveforms efficiently by leveraging broadcasting.
- It supports generating various types of spectrograms:
- - amplitude spectrogram (`power = 1.0`)
- - power spectrogram (`power = 2.0`)
- - complex-valued spectrogram (`power = None`)
- - log spectrogram (use `log_mel` argument)
- - mel spectrogram (provide `mel_filters`)
- - log-mel spectrogram (provide `mel_filters` and `log_mel`)
- How this works:
- 1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length
- - hop_length` samples.
- 2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`.
- 3. The DFT is taken of each windowed frame.
- 4. The results are stacked into a spectrogram.
- We make a distinction between the following "blocks" of sample data, each of which may have a different lengths:
- - The analysis frame. This is the size of the time slices that the input waveform is split into.
- - The window. Each analysis frame is multiplied by the window to avoid spectral leakage.
- - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram.
- In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A
- padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame,
- typically the next power of two.
- Note: This function is designed for efficient batch processing of multiple waveforms but retains compatibility with individual waveform processing methods like `librosa.stft`.
- Args:
- waveform_list (`list[np.ndarray]` with arrays of shape `(length,)`):
- The list of input waveforms, each a single-channel (mono) signal.
- window (`np.ndarray` of shape `(frame_length,)`):
- The windowing function to apply, including zero-padding if necessary.
- frame_length (`int`):
- The length of each frame for analysis.
- hop_length (`int`):
- The step size between successive frames.
- fft_length (`int`, *optional*):
- The size of the FFT buffer, defining frequency bin resolution.
- power (`float`, *optional*, defaults to 1.0):
- Determines the type of spectrogram: 1.0 for amplitude, 2.0 for power, None for complex.
- center (`bool`, *optional*, defaults to `True`):
- Whether to center-pad the waveform frames.
- pad_mode (`str`, *optional*, defaults to `"reflect"`):
- The padding strategy when `center` is `True`.
- onesided (`bool`, *optional*, defaults to `True`):
- If True, returns a one-sided spectrogram for real input signals.
- dither (`float`, *optional*, defaults to 0.0):
- Adds dithering. In other words, adds a small Gaussian noise to each frame.
- E.g. use 4.0 to add dithering with a normal distribution centered
- around 0.0 with standard deviation 4.0, 0.0 means no dithering.
- preemphasis (`float`, *optional*):
- Applies a pre-emphasis filter to each frame.
- mel_filters (`np.ndarray`, *optional*):
- Mel filter bank for converting to mel spectrogram.
- mel_floor (`float`, *optional*, defaults to 1e-10):
- Floor value for mel spectrogram to avoid log(0).
- log_mel (`str`, *optional*):
- Specifies log scaling strategy; options are None, "log", "log10", "dB".
- reference (`float`, *optional*, defaults to 1.0):
- Reference value for dB conversion in log_mel.
- min_value (`float`, *optional*, defaults to 1e-10):
- Minimum floor value for log scale conversions.
- db_range (`float`, *optional*):
- Dynamic range for dB scale spectrograms.
- remove_dc_offset (`bool`, *optional*):
- Whether to remove the DC offset from each frame.
- dtype (`np.dtype`, *optional*, defaults to `np.float32`):
- Data type of the output spectrogram.
- Returns:
- list[`np.ndarray`]: A list of spectrogram arrays, one for each input waveform.
- """
- window_length = len(window)
- if fft_length is None:
- fft_length = frame_length
- if frame_length > fft_length:
- raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})")
- if window_length != frame_length:
- raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})")
- if hop_length <= 0:
- raise ValueError("hop_length must be greater than zero")
- # Check the dimensions of the waveform , and if waveform is complex
- for waveform in waveform_list:
- if waveform.ndim != 1:
- raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}")
- if np.iscomplexobj(waveform):
- raise ValueError("Complex-valued input waveforms are not currently supported")
- # Center pad the waveform
- if center:
- padding = [(int(frame_length // 2), int(frame_length // 2))]
- waveform_list = [
- np.pad(
- waveform,
- padding,
- mode=pad_mode,
- )
- for waveform in waveform_list
- ]
- original_waveform_lengths = [
- len(waveform) for waveform in waveform_list
- ] # these lengths will be used to remove padding later
- # Batch pad the waveform
- max_length = max(original_waveform_lengths)
- padded_waveform_batch = np.array(
- [
- np.pad(waveform, (0, max_length - len(waveform)), mode="constant", constant_values=0)
- for waveform in waveform_list
- ],
- dtype=dtype,
- )
- # Promote to float64, since np.fft uses float64 internally
- padded_waveform_batch = padded_waveform_batch.astype(np.float64)
- window = window.astype(np.float64)
- # Split waveform into frames of frame_length size
- num_frames = int(1 + np.floor((padded_waveform_batch.shape[1] - frame_length) / hop_length))
- # these lengths will be used to remove padding later
- true_num_frames = [int(1 + np.floor((length - frame_length) / hop_length)) for length in original_waveform_lengths]
- num_batches = padded_waveform_batch.shape[0]
- num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length
- spectrogram = np.empty((num_batches, num_frames, num_frequency_bins), dtype=np.complex64)
- # rfft is faster than fft
- fft_func = np.fft.rfft if onesided else np.fft.fft
- buffer = np.zeros((num_batches, fft_length))
- for frame_idx in range(num_frames):
- timestep = frame_idx * hop_length
- buffer[:, :frame_length] = padded_waveform_batch[:, timestep : timestep + frame_length]
- if dither != 0.0:
- buffer[:, :frame_length] += dither * np.random.randn(*buffer[:, :frame_length].shape)
- if remove_dc_offset:
- buffer[:, :frame_length] -= buffer[:, :frame_length].mean(axis=1, keepdims=True)
- if preemphasis is not None:
- buffer[:, 1:frame_length] -= preemphasis * buffer[:, : frame_length - 1]
- buffer[:, 0] *= 1 - preemphasis
- buffer[:, :frame_length] *= window
- spectrogram[:, frame_idx] = fft_func(buffer)
- # Note: ** is much faster than np.power
- if power is not None:
- spectrogram = np.abs(spectrogram, dtype=np.float64) ** power
- # Apply mel filters if provided
- if mel_filters is not None:
- result = np.tensordot(spectrogram, mel_filters.T, axes=([2], [1]))
- spectrogram = np.maximum(mel_floor, result)
- # Convert to log scale if specified
- if power is not None and log_mel is not None:
- if log_mel == "log":
- spectrogram = np.log(spectrogram)
- elif log_mel == "log10":
- spectrogram = np.log10(spectrogram)
- elif log_mel == "dB":
- if power == 1.0:
- spectrogram = amplitude_to_db_batch(spectrogram, reference, min_value, db_range)
- elif power == 2.0:
- spectrogram = power_to_db_batch(spectrogram, reference, min_value, db_range)
- else:
- raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}")
- else:
- raise ValueError(f"Unknown log_mel option: {log_mel}")
- spectrogram = np.asarray(spectrogram, dtype)
- spectrogram_list = [spectrogram[i, : true_num_frames[i], :].T for i in range(len(true_num_frames))]
- return spectrogram_list
- def power_to_db(
- spectrogram: np.ndarray,
- reference: float = 1.0,
- min_value: float = 1e-10,
- db_range: float | None = None,
- ) -> np.ndarray:
- """
- Converts a power spectrogram to the decibel scale. This computes `10 * log10(spectrogram / reference)`, using basic
- logarithm properties for numerical stability.
- The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a
- linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it.
- This means that large variations in energy may not sound all that different if the sound is loud to begin with.
- This compression operation makes the (mel) spectrogram features match more closely what humans actually hear.
- Based on the implementation of `librosa.power_to_db`.
- Args:
- spectrogram (`np.ndarray`):
- The input power (mel) spectrogram. Note that a power spectrogram has the amplitudes squared!
- reference (`float`, *optional*, defaults to 1.0):
- Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
- the loudest part to 0 dB. Must be greater than zero.
- min_value (`float`, *optional*, defaults to `1e-10`):
- The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
- `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero.
- db_range (`float`, *optional*):
- Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
- peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
- Returns:
- `np.ndarray`: the spectrogram in decibels
- """
- if reference <= 0.0:
- raise ValueError("reference must be greater than zero")
- if min_value <= 0.0:
- raise ValueError("min_value must be greater than zero")
- reference = max(min_value, reference)
- spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None)
- spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference))
- if db_range is not None:
- if db_range <= 0.0:
- raise ValueError("db_range must be greater than zero")
- spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None)
- return spectrogram
- def power_to_db_batch(
- spectrogram: np.ndarray,
- reference: float = 1.0,
- min_value: float = 1e-10,
- db_range: float | None = None,
- ) -> np.ndarray:
- """
- Converts a batch of power spectrograms to the decibel scale. This computes `10 * log10(spectrogram / reference)`,
- using basic logarithm properties for numerical stability.
- This function supports batch processing, where each item in the batch is an individual power (mel) spectrogram.
- Args:
- spectrogram (`np.ndarray`):
- The input batch of power (mel) spectrograms. Expected shape is (batch_size, *spectrogram_shape).
- Note that a power spectrogram has the amplitudes squared!
- reference (`float`, *optional*, defaults to 1.0):
- Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
- the loudest part to 0 dB. Must be greater than zero.
- min_value (`float`, *optional*, defaults to `1e-10`):
- The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
- `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero.
- db_range (`float`, *optional*):
- Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
- peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
- Returns:
- `np.ndarray`: the batch of spectrograms in decibels
- """
- if reference <= 0.0:
- raise ValueError("reference must be greater than zero")
- if min_value <= 0.0:
- raise ValueError("min_value must be greater than zero")
- reference = max(min_value, reference)
- spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None)
- spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference))
- if db_range is not None:
- if db_range <= 0.0:
- raise ValueError("db_range must be greater than zero")
- # Apply db_range clipping per batch item
- max_values = spectrogram.max(axis=(1, 2), keepdims=True)
- spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None)
- return spectrogram
- def amplitude_to_db(
- spectrogram: np.ndarray,
- reference: float = 1.0,
- min_value: float = 1e-5,
- db_range: float | None = None,
- ) -> np.ndarray:
- """
- Converts an amplitude spectrogram to the decibel scale. This computes `20 * log10(spectrogram / reference)`, using
- basic logarithm properties for numerical stability.
- The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a
- linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it.
- This means that large variations in energy may not sound all that different if the sound is loud to begin with.
- This compression operation makes the (mel) spectrogram features match more closely what humans actually hear.
- Args:
- spectrogram (`np.ndarray`):
- The input amplitude (mel) spectrogram.
- reference (`float`, *optional*, defaults to 1.0):
- Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
- the loudest part to 0 dB. Must be greater than zero.
- min_value (`float`, *optional*, defaults to `1e-5`):
- The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
- `log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero.
- db_range (`float`, *optional*):
- Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
- peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
- Returns:
- `np.ndarray`: the spectrogram in decibels
- """
- if reference <= 0.0:
- raise ValueError("reference must be greater than zero")
- if min_value <= 0.0:
- raise ValueError("min_value must be greater than zero")
- reference = max(min_value, reference)
- spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None)
- spectrogram = 20.0 * (np.log10(spectrogram) - np.log10(reference))
- if db_range is not None:
- if db_range <= 0.0:
- raise ValueError("db_range must be greater than zero")
- spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None)
- return spectrogram
- def amplitude_to_db_batch(
- spectrogram: np.ndarray, reference: float = 1.0, min_value: float = 1e-5, db_range: float | None = None
- ) -> np.ndarray:
- """
- Converts a batch of amplitude spectrograms to the decibel scale. This computes `20 * log10(spectrogram / reference)`,
- using basic logarithm properties for numerical stability.
- The function supports batch processing, where each item in the batch is an individual amplitude (mel) spectrogram.
- Args:
- spectrogram (`np.ndarray`):
- The input batch of amplitude (mel) spectrograms. Expected shape is (batch_size, *spectrogram_shape).
- reference (`float`, *optional*, defaults to 1.0):
- Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
- the loudest part to 0 dB. Must be greater than zero.
- min_value (`float`, *optional*, defaults to `1e-5`):
- The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
- `log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero.
- db_range (`float`, *optional*):
- Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
- peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
- Returns:
- `np.ndarray`: the batch of spectrograms in decibels
- """
- if reference <= 0.0:
- raise ValueError("reference must be greater than zero")
- if min_value <= 0.0:
- raise ValueError("min_value must be greater than zero")
- reference = max(min_value, reference)
- spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None)
- spectrogram = 20.0 * (np.log10(spectrogram) - np.log10(reference))
- if db_range is not None:
- if db_range <= 0.0:
- raise ValueError("db_range must be greater than zero")
- # Apply db_range clipping per batch item
- max_values = spectrogram.max(axis=(1, 2), keepdims=True)
- spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None)
- return spectrogram
|