// sessions.d.ts (29 KB)
  import { APIResource } from "../../../resource.js";
  import * as Core from "../../../core.js";
  /**
   * API resource exposing the Realtime session creation call.
   */
  export declare class Sessions extends APIResource {
    /**
     * Create an ephemeral API token for use in client-side applications with the
     * Realtime API. Can be configured with the same session parameters as the
     * `session.update` client event.
     *
     * It responds with a session object, plus a `client_secret` key which contains a
     * usable ephemeral API token that can be used to authenticate browser clients for
     * the Realtime API.
     *
     * @param body - Session configuration. Every field is optional, so an empty
     * object requests a session with the server defaults.
     * @param options - Optional per-request options.
     * @returns A promise for the created session, including its `client_secret`.
     *
     * @example
     * ```ts
     * const session =
     *   await client.beta.realtime.sessions.create({});
     * ```
     */
    create(body: SessionCreateParams, options?: Core.RequestOptions): Core.APIPromise<SessionCreateResponse>;
  }
  /**
   * Realtime session object configuration.
   */
  export interface Session {
    /**
     * Unique identifier for the session that looks like `sess_1234567890abcdef`.
     */
    id?: string;
    /**
     * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
     * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
     * (mono), and little-endian byte order.
     */
    input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
    /**
     * Configuration for input audio noise reduction. This can be set to `null` to turn
     * off. Noise reduction filters audio added to the input audio buffer before it is
     * sent to VAD and the model. Filtering the audio can improve VAD and turn
     * detection accuracy (reducing false positives) and model performance by improving
     * perception of the input audio.
     */
    input_audio_noise_reduction?: Session.InputAudioNoiseReduction;
    /**
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
     * asynchronously through
     * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
     * and should be treated as guidance of input audio content rather than precisely
     * what the model heard. The client can optionally set the language and prompt for
     * transcription, these offer additional guidance to the transcription service.
     */
    input_audio_transcription?: Session.InputAudioTranscription;
    /**
     * The default system instructions (i.e. system message) prepended to model calls.
     * This field allows the client to guide the model on desired responses. The model
     * can be instructed on response content and format, (e.g. "be extremely succinct",
     * "act friendly", "here are examples of good responses") and on audio behavior
     * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
     * instructions are not guaranteed to be followed by the model, but they provide
     * guidance to the model on the desired behavior.
     *
     * Note that the server sets default instructions which will be used if this field
     * is not set and are visible in the `session.created` event at the start of the
     * session.
     */
    instructions?: string;
    /**
     * Maximum number of output tokens for a single assistant response, inclusive of
     * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
     * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
     */
    max_response_output_tokens?: number | 'inf';
    /**
     * The set of modalities the model can respond with. To disable audio, set this to
     * ["text"].
     */
    modalities?: Array<'text' | 'audio'>;
    /**
     * The Realtime model used for this session.
     */
    model?: 'gpt-4o-realtime-preview' | 'gpt-4o-realtime-preview-2024-10-01' | 'gpt-4o-realtime-preview-2024-12-17' | 'gpt-4o-mini-realtime-preview' | 'gpt-4o-mini-realtime-preview-2024-12-17';
    /**
     * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
     * For `pcm16`, output audio is sampled at a rate of 24kHz.
     */
    output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
    /**
     * Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
     * temperature of 0.8 is highly recommended for best performance.
     */
    temperature?: number;
    /**
     * How the model chooses tools. Options are `auto`, `none`, `required`, or specify
     * a function.
     */
    tool_choice?: string;
    /**
     * Tools (functions) available to the model.
     */
    tools?: Array<Session.Tool>;
    /**
     * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
     * set to `null` to turn off, in which case the client must manually trigger model
     * response. Server VAD means that the model will detect the start and end of
     * speech based on audio volume and respond at the end of user speech. Semantic VAD
     * is more advanced and uses a turn detection model (in conjunction with VAD) to
     * semantically estimate whether the user has finished speaking, then dynamically
     * sets a timeout based on this probability. For example, if user audio trails off
     * with "uhhm", the model will score a low probability of turn end and wait longer
     * for the user to continue speaking. This can be useful for more natural
     * conversations, but may have a higher latency.
     */
    turn_detection?: Session.TurnDetection;
    /**
     * The voice the model uses to respond. Voice cannot be changed during the session
     * once the model has responded with audio at least once. Current voice options are
     * `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`,
     * `shimmer`, and `verse`.
     */
    voice?: (string & {}) | 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'fable' | 'onyx' | 'nova' | 'sage' | 'shimmer' | 'verse';
  }
  /**
   * Nested types referenced by the members of {@link Session}.
   */
  export declare namespace Session {
    /**
     * Configuration for input audio noise reduction. This can be set to `null` to turn
     * off. Noise reduction filters audio added to the input audio buffer before it is
     * sent to VAD and the model. Filtering the audio can improve VAD and turn
     * detection accuracy (reducing false positives) and model performance by improving
     * perception of the input audio.
     */
    interface InputAudioNoiseReduction {
      /**
       * Type of noise reduction. `near_field` is for close-talking microphones such as
       * headphones, `far_field` is for far-field microphones such as laptop or
       * conference room microphones.
       */
      type?: 'near_field' | 'far_field';
    }
    /**
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
     * asynchronously through
     * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
     * and should be treated as guidance of input audio content rather than precisely
     * what the model heard. The client can optionally set the language and prompt for
     * transcription, these offer additional guidance to the transcription service.
     */
    interface InputAudioTranscription {
      /**
       * The language of the input audio. Supplying the input language in
       * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
       * format will improve accuracy and latency.
       */
      language?: string;
      /**
       * The model to use for transcription, current options are `gpt-4o-transcribe`,
       * `gpt-4o-mini-transcribe`, and `whisper-1`.
       */
      model?: string;
      /**
       * An optional text to guide the model's style or continue a previous audio
       * segment. For `whisper-1`, the
       * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
       * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
       * "expect words related to technology".
       */
      prompt?: string;
    }
    /**
     * A function tool made available to the model.
     */
    interface Tool {
      /**
       * The description of the function, including guidance on when and how to call it,
       * and guidance about what to tell the user when calling (if anything).
       */
      description?: string;
      /**
       * The name of the function.
       */
      name?: string;
      /**
       * Parameters of the function in JSON Schema.
       */
      parameters?: unknown;
      /**
       * The type of the tool, i.e. `function`.
       */
      type?: 'function';
    }
    /**
     * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
     * set to `null` to turn off, in which case the client must manually trigger model
     * response. Server VAD means that the model will detect the start and end of
     * speech based on audio volume and respond at the end of user speech. Semantic VAD
     * is more advanced and uses a turn detection model (in conjunction with VAD) to
     * semantically estimate whether the user has finished speaking, then dynamically
     * sets a timeout based on this probability. For example, if user audio trails off
     * with "uhhm", the model will score a low probability of turn end and wait longer
     * for the user to continue speaking. This can be useful for more natural
     * conversations, but may have a higher latency.
     */
    interface TurnDetection {
      /**
       * Whether or not to automatically generate a response when a VAD stop event
       * occurs.
       */
      create_response?: boolean;
      /**
       * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
       * will wait longer for the user to continue speaking, `high` will respond more
       * quickly. `auto` is the default and is equivalent to `medium`.
       */
      eagerness?: 'low' | 'medium' | 'high' | 'auto';
      /**
       * Whether or not to automatically interrupt any ongoing response with output to
       * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
       * occurs.
       */
      interrupt_response?: boolean;
      /**
       * Used only for `server_vad` mode. Amount of audio to include before the VAD
       * detected speech (in milliseconds). Defaults to 300ms.
       */
      prefix_padding_ms?: number;
      /**
       * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
       * milliseconds). Defaults to 500ms. With shorter values the model will respond
       * more quickly, but may jump in on short pauses from the user.
       */
      silence_duration_ms?: number;
      /**
       * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
       * defaults to 0.5. A higher threshold will require louder audio to activate the
       * model, and thus might perform better in noisy environments.
       */
      threshold?: number;
      /**
       * Type of turn detection.
       */
      type?: 'server_vad' | 'semantic_vad';
    }
  }
  /**
   * A new Realtime session configuration, with an ephemeral key. Default TTL for
   * keys is one minute.
   */
  export interface SessionCreateResponse {
    /**
     * Ephemeral key returned by the API.
     */
    client_secret: SessionCreateResponse.ClientSecret;
    /**
     * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
     */
    input_audio_format?: string;
    /**
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
     * asynchronously through Whisper and should be treated as rough guidance rather
     * than the representation understood by the model.
     */
    input_audio_transcription?: SessionCreateResponse.InputAudioTranscription;
    /**
     * The default system instructions (i.e. system message) prepended to model calls.
     * This field allows the client to guide the model on desired responses. The model
     * can be instructed on response content and format, (e.g. "be extremely succinct",
     * "act friendly", "here are examples of good responses") and on audio behavior
     * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
     * instructions are not guaranteed to be followed by the model, but they provide
     * guidance to the model on the desired behavior.
     *
     * Note that the server sets default instructions which will be used if this field
     * is not set and are visible in the `session.created` event at the start of the
     * session.
     */
    instructions?: string;
    /**
     * Maximum number of output tokens for a single assistant response, inclusive of
     * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
     * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
     */
    max_response_output_tokens?: number | 'inf';
    /**
     * The set of modalities the model can respond with. To disable audio, set this to
     * ["text"].
     */
    modalities?: Array<'text' | 'audio'>;
    /**
     * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
     */
    output_audio_format?: string;
    /**
     * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
     */
    temperature?: number;
    /**
     * How the model chooses tools. Options are `auto`, `none`, `required`, or specify
     * a function.
     */
    tool_choice?: string;
    /**
     * Tools (functions) available to the model.
     */
    tools?: Array<SessionCreateResponse.Tool>;
    /**
     * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
     * means that the model will detect the start and end of speech based on audio
     * volume and respond at the end of user speech.
     */
    turn_detection?: SessionCreateResponse.TurnDetection;
    /**
     * The voice the model uses to respond. Voice cannot be changed during the session
     * once the model has responded with audio at least once. Current voice options are
     * `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`,
     * `shimmer`, and `verse`.
     */
    voice?: (string & {}) | 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'fable' | 'onyx' | 'nova' | 'sage' | 'shimmer' | 'verse';
  }
  /**
   * Nested types referenced by the members of {@link SessionCreateResponse}.
   */
  export declare namespace SessionCreateResponse {
    /**
     * Ephemeral key returned by the API.
     */
    interface ClientSecret {
      /**
       * Timestamp for when the token expires. By default, tokens expire after one
       * minute.
       */
      expires_at: number;
      /**
       * Ephemeral key usable in client environments to authenticate connections to the
       * Realtime API. Use this in client-side environments rather than a standard API
       * token, which should only be used server-side.
       */
      value: string;
    }
    /**
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
     * asynchronously through Whisper and should be treated as rough guidance rather
     * than the representation understood by the model.
     */
    interface InputAudioTranscription {
      /**
       * The model to use for transcription, `whisper-1` is the only currently supported
       * model.
       */
      model?: string;
    }
    /**
     * A function tool made available to the model.
     */
    interface Tool {
      /**
       * The description of the function, including guidance on when and how to call it,
       * and guidance about what to tell the user when calling (if anything).
       */
      description?: string;
      /**
       * The name of the function.
       */
      name?: string;
      /**
       * Parameters of the function in JSON Schema.
       */
      parameters?: unknown;
      /**
       * The type of the tool, i.e. `function`.
       */
      type?: 'function';
    }
    /**
     * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
     * means that the model will detect the start and end of speech based on audio
     * volume and respond at the end of user speech.
     */
    interface TurnDetection {
      /**
       * Amount of audio to include before the VAD detected speech (in milliseconds).
       * Defaults to 300ms.
       */
      prefix_padding_ms?: number;
      /**
       * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
       * With shorter values the model will respond more quickly, but may jump in on
       * short pauses from the user.
       */
      silence_duration_ms?: number;
      /**
       * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
       * threshold will require louder audio to activate the model, and thus might
       * perform better in noisy environments.
       */
      threshold?: number;
      /**
       * Type of turn detection, only `server_vad` is currently supported.
       */
      type?: string;
    }
  }
  /**
   * Request body accepted by `Sessions.create`. Every field is optional.
   */
  export interface SessionCreateParams {
    /**
     * Configuration options for the generated client secret.
     */
    client_secret?: SessionCreateParams.ClientSecret;
    /**
     * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
     * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
     * (mono), and little-endian byte order.
     */
    input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
    /**
     * Configuration for input audio noise reduction. This can be set to `null` to turn
     * off. Noise reduction filters audio added to the input audio buffer before it is
     * sent to VAD and the model. Filtering the audio can improve VAD and turn
     * detection accuracy (reducing false positives) and model performance by improving
     * perception of the input audio.
     */
    input_audio_noise_reduction?: SessionCreateParams.InputAudioNoiseReduction;
    /**
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
     * asynchronously through
     * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
     * and should be treated as guidance of input audio content rather than precisely
     * what the model heard. The client can optionally set the language and prompt for
     * transcription, these offer additional guidance to the transcription service.
     */
    input_audio_transcription?: SessionCreateParams.InputAudioTranscription;
    /**
     * The default system instructions (i.e. system message) prepended to model calls.
     * This field allows the client to guide the model on desired responses. The model
     * can be instructed on response content and format, (e.g. "be extremely succinct",
     * "act friendly", "here are examples of good responses") and on audio behavior
     * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
     * instructions are not guaranteed to be followed by the model, but they provide
     * guidance to the model on the desired behavior.
     *
     * Note that the server sets default instructions which will be used if this field
     * is not set and are visible in the `session.created` event at the start of the
     * session.
     */
    instructions?: string;
    /**
     * Maximum number of output tokens for a single assistant response, inclusive of
     * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
     * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
     */
    max_response_output_tokens?: number | 'inf';
    /**
     * The set of modalities the model can respond with. To disable audio, set this to
     * ["text"].
     */
    modalities?: Array<'text' | 'audio'>;
    /**
     * The Realtime model used for this session.
     */
    model?: 'gpt-4o-realtime-preview' | 'gpt-4o-realtime-preview-2024-10-01' | 'gpt-4o-realtime-preview-2024-12-17' | 'gpt-4o-mini-realtime-preview' | 'gpt-4o-mini-realtime-preview-2024-12-17';
    /**
     * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
     * For `pcm16`, output audio is sampled at a rate of 24kHz.
     */
    output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
    /**
     * Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
     * temperature of 0.8 is highly recommended for best performance.
     */
    temperature?: number;
    /**
     * How the model chooses tools. Options are `auto`, `none`, `required`, or specify
     * a function.
     */
    tool_choice?: string;
    /**
     * Tools (functions) available to the model.
     */
    tools?: Array<SessionCreateParams.Tool>;
    /**
     * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
     * set to `null` to turn off, in which case the client must manually trigger model
     * response. Server VAD means that the model will detect the start and end of
     * speech based on audio volume and respond at the end of user speech. Semantic VAD
     * is more advanced and uses a turn detection model (in conjunction with VAD) to
     * semantically estimate whether the user has finished speaking, then dynamically
     * sets a timeout based on this probability. For example, if user audio trails off
     * with "uhhm", the model will score a low probability of turn end and wait longer
     * for the user to continue speaking. This can be useful for more natural
     * conversations, but may have a higher latency.
     */
    turn_detection?: SessionCreateParams.TurnDetection;
    /**
     * The voice the model uses to respond. Voice cannot be changed during the session
     * once the model has responded with audio at least once. Current voice options are
     * `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`,
     * `shimmer`, and `verse`.
     */
    voice?: (string & {}) | 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'fable' | 'onyx' | 'nova' | 'sage' | 'shimmer' | 'verse';
  }
  /**
   * Nested types referenced by the members of {@link SessionCreateParams}.
   */
  export declare namespace SessionCreateParams {
    /**
     * Configuration options for the generated client secret.
     */
    interface ClientSecret {
      /**
       * Configuration for the ephemeral token expiration.
       */
      expires_at?: ClientSecret.ExpiresAt;
    }
    /**
     * Nested types referenced by the members of {@link ClientSecret}.
     */
    namespace ClientSecret {
      /**
       * Configuration for the ephemeral token expiration.
       */
      interface ExpiresAt {
        /**
         * The anchor point for the ephemeral token expiration. Only `created_at` is
         * currently supported.
         */
        anchor?: 'created_at';
        /**
         * The number of seconds from the anchor point to the expiration. Select a value
         * between `10` and `7200`.
         */
        seconds?: number;
      }
    }
    /**
     * Configuration for input audio noise reduction. This can be set to `null` to turn
     * off. Noise reduction filters audio added to the input audio buffer before it is
     * sent to VAD and the model. Filtering the audio can improve VAD and turn
     * detection accuracy (reducing false positives) and model performance by improving
     * perception of the input audio.
     */
    interface InputAudioNoiseReduction {
      /**
       * Type of noise reduction. `near_field` is for close-talking microphones such as
       * headphones, `far_field` is for far-field microphones such as laptop or
       * conference room microphones.
       */
      type?: 'near_field' | 'far_field';
    }
    /**
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
     * asynchronously through
     * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
     * and should be treated as guidance of input audio content rather than precisely
     * what the model heard. The client can optionally set the language and prompt for
     * transcription, these offer additional guidance to the transcription service.
     */
    interface InputAudioTranscription {
      /**
       * The language of the input audio. Supplying the input language in
       * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
       * format will improve accuracy and latency.
       */
      language?: string;
      /**
       * The model to use for transcription, current options are `gpt-4o-transcribe`,
       * `gpt-4o-mini-transcribe`, and `whisper-1`.
       */
      model?: string;
      /**
       * An optional text to guide the model's style or continue a previous audio
       * segment. For `whisper-1`, the
       * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
       * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
       * "expect words related to technology".
       */
      prompt?: string;
    }
    /**
     * A function tool made available to the model.
     */
    interface Tool {
      /**
       * The description of the function, including guidance on when and how to call it,
       * and guidance about what to tell the user when calling (if anything).
       */
      description?: string;
      /**
       * The name of the function.
       */
      name?: string;
      /**
       * Parameters of the function in JSON Schema.
       */
      parameters?: unknown;
      /**
       * The type of the tool, i.e. `function`.
       */
      type?: 'function';
    }
    /**
     * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
     * set to `null` to turn off, in which case the client must manually trigger model
     * response. Server VAD means that the model will detect the start and end of
     * speech based on audio volume and respond at the end of user speech. Semantic VAD
     * is more advanced and uses a turn detection model (in conjunction with VAD) to
     * semantically estimate whether the user has finished speaking, then dynamically
     * sets a timeout based on this probability. For example, if user audio trails off
     * with "uhhm", the model will score a low probability of turn end and wait longer
     * for the user to continue speaking. This can be useful for more natural
     * conversations, but may have a higher latency.
     */
    interface TurnDetection {
      /**
       * Whether or not to automatically generate a response when a VAD stop event
       * occurs.
       */
      create_response?: boolean;
      /**
       * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
       * will wait longer for the user to continue speaking, `high` will respond more
       * quickly. `auto` is the default and is equivalent to `medium`.
       */
      eagerness?: 'low' | 'medium' | 'high' | 'auto';
      /**
       * Whether or not to automatically interrupt any ongoing response with output to
       * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
       * occurs.
       */
      interrupt_response?: boolean;
      /**
       * Used only for `server_vad` mode. Amount of audio to include before the VAD
       * detected speech (in milliseconds). Defaults to 300ms.
       */
      prefix_padding_ms?: number;
      /**
       * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
       * milliseconds). Defaults to 500ms. With shorter values the model will respond
       * more quickly, but may jump in on short pauses from the user.
       */
      silence_duration_ms?: number;
      /**
       * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
       * defaults to 0.5. A higher threshold will require louder audio to activate the
       * model, and thus might perform better in noisy environments.
       */
      threshold?: number;
      /**
       * Type of turn detection.
       */
      type?: 'server_vad' | 'semantic_vad';
    }
  }
  /**
   * Re-exports the session types under the `Sessions` name so they can be
   * referenced as `Sessions.Session`, `Sessions.SessionCreateResponse` and
   * `Sessions.SessionCreateParams`.
   */
  export declare namespace Sessions {
    export {
      type Session as Session,
      type SessionCreateResponse as SessionCreateResponse,
      type SessionCreateParams as SessionCreateParams,
    };
  }
//# sourceMappingURL=sessions.d.ts.map