transcription-sessions.d.ts 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298
  1. import { APIResource } from "../../../resource.js";
  2. import * as Core from "../../../core.js";
  3. export declare class TranscriptionSessions extends APIResource {
  4. /**
  5. * Create an ephemeral API token for use in client-side applications with the
  6. * Realtime API specifically for realtime transcriptions. Can be configured with
  7. * the same session parameters as the `transcription_session.update` client event.
  8. *
  9. * It responds with a session object, plus a `client_secret` key which contains a
  10. * usable ephemeral API token that can be used to authenticate browser clients for
  11. * the Realtime API. `body` is required, but every field in it is optional.
  12. *
  13. * @example
  14. * ```ts
  15. * const transcriptionSession =
  16. * await client.beta.realtime.transcriptionSessions.create({});
  17. * ```
  18. */
  19. create(body: TranscriptionSessionCreateParams, options?: Core.RequestOptions): Core.APIPromise<TranscriptionSession>;
  20. }
  21. /**
  22. * A new Realtime transcription session configuration.
  23. *
  24. * When a session is created on the server via REST API, the session object also
  25. * contains an ephemeral key (`client_secret`). Default TTL for keys is 10 minutes.
  26. * The key is not present when a session is updated via the WebSocket API.
  27. */
  28. export interface TranscriptionSession {
  29. /**
  30. * Ephemeral key returned by the API. Only present when the session is created on
  31. * the server via REST API.
  32. */
  33. client_secret: TranscriptionSession.ClientSecret;
  34. /**
  35. * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
  36. */
  37. input_audio_format?: string;
  38. /**
  39. * Configuration of the transcription model.
  40. */
  41. input_audio_transcription?: TranscriptionSession.InputAudioTranscription;
  42. /**
  43. * The set of modalities the model can respond with. To disable audio, set this to
  44. * `["text"]`.
  45. */
  46. modalities?: Array<'text' | 'audio'>;
  47. /**
  48. * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
  49. * means that the model will detect the start and end of speech based on audio
  50. * volume and respond at the end of user speech.
  51. */
  52. turn_detection?: TranscriptionSession.TurnDetection;
  53. }
  54. export declare namespace TranscriptionSession {
  55. /**
  56. * Ephemeral key returned by the API. Only present when the session is created on
  57. * the server via REST API.
  58. */
  59. interface ClientSecret {
  60. /**
  61. * Timestamp for when the token expires. The default key TTL is 10 minutes; a
  62. * different lifetime can be requested via the create params' `client_secret`.
  63. */
  64. expires_at: number;
  65. /**
  66. * Ephemeral key usable in client environments to authenticate connections to the
  67. * Realtime API. Use this in client-side environments rather than a standard API
  68. * token, which should only be used server-side.
  69. */
  70. value: string;
  71. }
  72. /**
  73. * Configuration of the transcription model.
  74. */
  75. interface InputAudioTranscription {
  76. /**
  77. * The language of the input audio. Supplying the input language in
  78. * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
  79. * format will improve accuracy and latency.
  80. */
  81. language?: string;
  82. /**
  83. * The model to use for transcription. Can be `gpt-4o-transcribe`,
  84. * `gpt-4o-mini-transcribe`, or `whisper-1`.
  85. */
  86. model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
  87. /**
  88. * An optional text to guide the model's style or continue a previous audio
  89. * segment. The
  90. * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
  91. * should match the audio language.
  92. */
  93. prompt?: string;
  94. }
  95. /**
  96. * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
  97. * means that the model will detect the start and end of speech based on audio
  98. * volume and respond at the end of user speech.
  99. */
  100. interface TurnDetection {
  101. /**
  102. * Amount of audio to include before the VAD detected speech (in milliseconds).
  103. * Defaults to 300ms.
  104. */
  105. prefix_padding_ms?: number;
  106. /**
  107. * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
  108. * With shorter values the model will respond more quickly, but may jump in on
  109. * short pauses from the user.
  110. */
  111. silence_duration_ms?: number;
  112. /**
  113. * Activation threshold for VAD (0.0 to 1.0); this defaults to 0.5. A higher
  114. * threshold will require louder audio to activate the model, and thus might
  115. * perform better in noisy environments.
  116. */
  117. threshold?: number;
  118. /**
  119. * Type of turn detection; only `server_vad` is currently supported.
  120. */
  121. type?: string;
  122. }
  123. }
  124. export interface TranscriptionSessionCreateParams {
  125. /**
  126. * Configuration options for the generated client secret.
  127. */
  128. client_secret?: TranscriptionSessionCreateParams.ClientSecret;
  129. /**
  130. * The set of items to include in the transcription. Current available items are:
  131. *
  132. * - `item.input_audio_transcription.logprobs`
  133. */
  134. include?: Array<string>;
  135. /**
  136. * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
  137. * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
  138. * (mono), and little-endian byte order.
  139. */
  140. input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
  141. /**
  142. * Configuration for input audio noise reduction. This can be set to `null` to turn
  143. * off. Noise reduction filters audio added to the input audio buffer before it is
  144. * sent to VAD and the model. Filtering the audio can improve VAD and turn
  145. * detection accuracy (reducing false positives) and model performance by improving
  146. * perception of the input audio.
  147. */
  148. input_audio_noise_reduction?: TranscriptionSessionCreateParams.InputAudioNoiseReduction;
  149. /**
  150. * Configuration for input audio transcription. The client can optionally set the
  151. * language and prompt for transcription; these offer additional guidance to the
  152. * transcription service.
  153. */
  154. input_audio_transcription?: TranscriptionSessionCreateParams.InputAudioTranscription;
  155. /**
  156. * The set of modalities the model can respond with. To disable audio, set this to
  157. * `["text"]`.
  158. */
  159. modalities?: Array<'text' | 'audio'>;
  160. /**
  161. * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
  162. * set to `null` to turn off, in which case the client must manually trigger model
  163. * response. Server VAD means that the model will detect the start and end of
  164. * speech based on audio volume and respond at the end of user speech. Semantic VAD
  165. * is more advanced and uses a turn detection model (in conjunction with VAD) to
  166. * semantically estimate whether the user has finished speaking, then dynamically
  167. * sets a timeout based on this probability. For example, if user audio trails off
  168. * with "uhhm", the model will score a low probability of turn end and wait longer
  169. * for the user to continue speaking. This can be useful for more natural
  170. * conversations, but may have a higher latency.
  171. */
  172. turn_detection?: TranscriptionSessionCreateParams.TurnDetection;
  173. }
  174. export declare namespace TranscriptionSessionCreateParams {
  175. /**
  176. * Configuration options for the generated client secret.
  177. */
  178. interface ClientSecret {
  179. /**
  180. * Configuration for the ephemeral token expiration.
  181. */
  182. expires_at?: ClientSecret.ExpiresAt;
  183. }
  184. namespace ClientSecret {
  185. /**
  186. * Configuration for the ephemeral token expiration.
  187. */
  188. interface ExpiresAt {
  189. /**
  190. * The anchor point for the ephemeral token expiration. Only `created_at` is
  191. * currently supported.
  192. */
  193. anchor?: 'created_at';
  194. /**
  195. * The number of seconds from the anchor point to the expiration. Select a value
  196. * between `10` and `7200`.
  197. */
  198. seconds?: number;
  199. }
  200. }
  201. /**
  202. * Configuration for input audio noise reduction. This can be set to `null` to turn
  203. * off. Noise reduction filters audio added to the input audio buffer before it is
  204. * sent to VAD and the model. Filtering the audio can improve VAD and turn
  205. * detection accuracy (reducing false positives) and model performance by improving
  206. * perception of the input audio.
  207. */
  208. interface InputAudioNoiseReduction {
  209. /**
  210. * Type of noise reduction. `near_field` is for close-talking microphones such as
  211. * headphones, `far_field` is for far-field microphones such as laptop or
  212. * conference room microphones.
  213. */
  214. type?: 'near_field' | 'far_field';
  215. }
  216. /**
  217. * Configuration for input audio transcription. The client can optionally set the
  218. * language and prompt for transcription; these offer additional guidance to the
  219. * transcription service.
  220. */
  221. interface InputAudioTranscription {
  222. /**
  223. * The language of the input audio. Supplying the input language in
  224. * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
  225. * format will improve accuracy and latency.
  226. */
  227. language?: string;
  228. /**
  229. * The model to use for transcription; current options are `gpt-4o-transcribe`,
  230. * `gpt-4o-mini-transcribe`, and `whisper-1`.
  231. */
  232. model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
  233. /**
  234. * An optional text to guide the model's style or continue a previous audio
  235. * segment. For `whisper-1`, the
  236. * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
  237. * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
  238. * "expect words related to technology".
  239. */
  240. prompt?: string;
  241. }
  242. /**
  243. * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
  244. * set to `null` to turn off, in which case the client must manually trigger model
  245. * response. Server VAD means that the model will detect the start and end of
  246. * speech based on audio volume and respond at the end of user speech. Semantic VAD
  247. * is more advanced and uses a turn detection model (in conjunction with VAD) to
  248. * semantically estimate whether the user has finished speaking, then dynamically
  249. * sets a timeout based on this probability. For example, if user audio trails off
  250. * with "uhhm", the model will score a low probability of turn end and wait longer
  251. * for the user to continue speaking. This can be useful for more natural
  252. * conversations, but may have a higher latency.
  253. */
  254. interface TurnDetection {
  255. /**
  256. * Whether or not to automatically generate a response when a VAD stop event
  257. * occurs. Not available for transcription sessions.
  258. */
  259. create_response?: boolean;
  260. /**
  261. * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
  262. * will wait longer for the user to continue speaking, `high` will respond more
  263. * quickly. `auto` is the default and is equivalent to `medium`.
  264. */
  265. eagerness?: 'low' | 'medium' | 'high' | 'auto';
  266. /**
  267. * Whether or not to automatically interrupt any ongoing response with output to
  268. * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
  269. * occurs. Not available for transcription sessions.
  270. */
  271. interrupt_response?: boolean;
  272. /**
  273. * Used only for `server_vad` mode. Amount of audio to include before the VAD
  274. * detected speech (in milliseconds). Defaults to 300ms.
  275. */
  276. prefix_padding_ms?: number;
  277. /**
  278. * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
  279. * milliseconds). Defaults to 500ms. With shorter values the model will respond
  280. * more quickly, but may jump in on short pauses from the user.
  281. */
  282. silence_duration_ms?: number;
  283. /**
  284. * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0); this
  285. * defaults to 0.5. A higher threshold will require louder audio to activate the
  286. * model, and thus might perform better in noisy environments.
  287. */
  288. threshold?: number;
  289. /**
  290. * Type of turn detection.
  291. */
  292. type?: 'server_vad' | 'semantic_vad';
  293. }
  294. }
  295. export declare namespace TranscriptionSessions { // merges with the `TranscriptionSessions` class so the types below are reachable as its members
  296. export { type TranscriptionSession as TranscriptionSession, type TranscriptionSessionCreateParams as TranscriptionSessionCreateParams, };
  297. }
  298. //# sourceMappingURL=transcription-sessions.d.ts.map