// runs.d.ts (60 KB)
  1. import { APIResource } from "../../../resource.js";
  2. import * as Core from "../../../core.js";
  3. import * as Shared from "../../shared.js";
  4. import * as ResponsesAPI from "../../responses/responses.js";
  5. import * as OutputItemsAPI from "./output-items.js";
  6. import { OutputItemListParams, OutputItemListResponse, OutputItemListResponsesPage, OutputItemRetrieveResponse, OutputItems } from "./output-items.js";
  7. import { CursorPage, type CursorPageParams } from "../../../pagination.js";
  /**
   * Resource class for evaluation runs. Every method takes the parent
   * evaluation's `evalId`; methods that target a single run also take `runId`.
   */
  export declare class Runs extends APIResource {
    /**
     * Nested `OutputItems` resource (declared in `./output-items.js`).
     */
    outputItems: OutputItemsAPI.OutputItems;

    /**
     * Kicks off a new run for a given evaluation, specifying the data source, and
     * what model configuration to use to test. The data source will be validated
     * against the schema specified in the config of the evaluation.
     */
    create(
      evalId: string,
      body: RunCreateParams,
      options?: Core.RequestOptions,
    ): Core.APIPromise<RunCreateResponse>;

    /**
     * Get an evaluation run by ID.
     */
    retrieve(
      evalId: string,
      runId: string,
      options?: Core.RequestOptions,
    ): Core.APIPromise<RunRetrieveResponse>;

    /**
     * Get a list of runs for an evaluation.
     *
     * Two overloads are declared: one accepting list query parameters followed by
     * request options, and one accepting request options only.
     */
    list(
      evalId: string,
      query?: RunListParams,
      options?: Core.RequestOptions,
    ): Core.PagePromise<RunListResponsesPage, RunListResponse>;
    list(
      evalId: string,
      options?: Core.RequestOptions,
    ): Core.PagePromise<RunListResponsesPage, RunListResponse>;

    /**
     * Delete an eval run.
     */
    del(
      evalId: string,
      runId: string,
      options?: Core.RequestOptions,
    ): Core.APIPromise<RunDeleteResponse>;

    /**
     * Cancel an ongoing evaluation run.
     */
    cancel(
      evalId: string,
      runId: string,
      options?: Core.RequestOptions,
    ): Core.APIPromise<RunCancelResponse>;
  }
  /**
   * Cursor-paginated page whose items are `RunListResponse` objects. This is the
   * page type named in the return type of `Runs.list`.
   */
  export declare class RunListResponsesPage extends CursorPage<RunListResponse> {
  }
  /**
   * A CompletionsRunDataSource object describing a model sampling configuration.
   */
  export interface CreateEvalCompletionsRunDataSource {
    /**
     * Determines what populates the `item` namespace in this run's data source.
     */
    source:
      | CreateEvalCompletionsRunDataSource.FileContent
      | CreateEvalCompletionsRunDataSource.FileID
      | CreateEvalCompletionsRunDataSource.StoredCompletions;

    /**
     * The type of run data source. Always `completions`.
     */
    type: 'completions';

    /**
     * Used when sampling from a model. Dictates the structure of the messages
     * passed into the model. Can either be a reference to a prebuilt trajectory
     * (i.e., `item.input_trajectory`), or a template with variable references to
     * the `item` namespace.
     */
    input_messages?:
      | CreateEvalCompletionsRunDataSource.Template
      | CreateEvalCompletionsRunDataSource.ItemReference;

    /**
     * The name of the model to use for generating completions (e.g. "o3-mini").
     */
    model?: string;

    /**
     * Optional sampling settings; see `SamplingParams` for the individual knobs.
     */
    sampling_params?: CreateEvalCompletionsRunDataSource.SamplingParams;
  }

  export declare namespace CreateEvalCompletionsRunDataSource {
    /**
     * Source variant (`type: 'file_content'`) that carries the JSONL rows inline.
     */
    interface FileContent {
      /**
       * The content of the jsonl file.
       */
      content: Array<FileContent.Content>;

      /**
       * The type of jsonl source. Always `file_content`.
       */
      type: 'file_content';
    }

    namespace FileContent {
      /**
       * One row of inline content: a required `item` record and an optional
       * `sample` record.
       */
      interface Content {
        item: Record<string, unknown>;
        sample?: Record<string, unknown>;
      }
    }

    /**
     * Source variant (`type: 'file_id'`) that refers to a file by its identifier.
     */
    interface FileID {
      /**
       * The identifier of the file.
       */
      id: string;

      /**
       * The type of jsonl source. Always `file_id`.
       */
      type: 'file_id';
    }

    /**
     * A StoredCompletionsRunDataSource configuration describing a set of filters.
     */
    interface StoredCompletions {
      /**
       * The type of source. Always `stored_completions`.
       */
      type: 'stored_completions';

      /**
       * An optional Unix timestamp to filter items created after this time.
       */
      created_after?: number | null;

      /**
       * An optional Unix timestamp to filter items created before this time.
       */
      created_before?: number | null;

      /**
       * An optional maximum number of items to return.
       */
      limit?: number | null;

      /**
       * Set of 16 key-value pairs that can be attached to an object. This can be
       * useful for storing additional information about the object in a structured
       * format, and querying for objects via API or the dashboard.
       *
       * Keys are strings with a maximum length of 64 characters. Values are strings
       * with a maximum length of 512 characters.
       */
      metadata?: Shared.Metadata | null;

      /**
       * An optional model to filter by (e.g., 'gpt-4o').
       */
      model?: string | null;
    }

    /**
     * Input-messages variant (`type: 'template'`) holding a list of chat messages.
     */
    interface Template {
      /**
       * A list of chat messages forming the prompt or context. May include variable
       * references to the `item` namespace, i.e. {{item.name}}.
       */
      template: Array<ResponsesAPI.EasyInputMessage | Template.Message>;

      /**
       * The type of input messages. Always `template`.
       */
      type: 'template';
    }

    namespace Template {
      /**
       * A message input to the model with a role indicating instruction following
       * hierarchy. Instructions given with the `developer` or `system` role take
       * precedence over instructions given with the `user` role. Messages with the
       * `assistant` role are presumed to have been generated by the model in
       * previous interactions.
       */
      interface Message {
        /**
         * Text inputs to the model - can contain template strings.
         */
        content: string | ResponsesAPI.ResponseInputText | Message.OutputText;

        /**
         * The role of the message input. One of `user`, `assistant`, `system`, or
         * `developer`.
         */
        role: 'user' | 'assistant' | 'system' | 'developer';

        /**
         * The type of the message input. Always `message`.
         */
        type?: 'message';
      }

      namespace Message {
        /**
         * A text output from the model.
         */
        interface OutputText {
          /**
           * The text output from the model.
           */
          text: string;

          /**
           * The type of the output text. Always `output_text`.
           */
          type: 'output_text';
        }
      }
    }

    /**
     * Input-messages variant (`type: 'item_reference'`) that points at a variable
     * in the `item` namespace instead of spelling the messages out.
     */
    interface ItemReference {
      /**
       * A reference to a variable in the `item` namespace, e.g.
       * "item.input_trajectory".
       */
      item_reference: string;

      /**
       * The type of input messages. Always `item_reference`.
       */
      type: 'item_reference';
    }

    /**
     * Optional sampling settings; every field may be omitted.
     */
    interface SamplingParams {
      /**
       * The maximum number of tokens in the generated output.
       */
      max_completion_tokens?: number;

      /**
       * A seed value to initialize the randomness during sampling.
       */
      seed?: number;

      /**
       * A higher temperature increases randomness in the outputs.
       */
      temperature?: number;

      /**
       * An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
       */
      top_p?: number;
    }
  }
  /**
   * A JsonlRunDataSource object that specifies a JSONL file that matches the eval.
   */
  export interface CreateEvalJSONLRunDataSource {
    /**
     * Determines what populates the `item` namespace in the data source.
     */
    source: CreateEvalJSONLRunDataSource.FileContent | CreateEvalJSONLRunDataSource.FileID;

    /**
     * The type of data source. Always `jsonl`.
     */
    type: 'jsonl';
  }

  export declare namespace CreateEvalJSONLRunDataSource {
    /**
     * Source variant (`type: 'file_content'`) that carries the JSONL rows inline.
     */
    interface FileContent {
      /**
       * The content of the jsonl file.
       */
      content: Array<FileContent.Content>;

      /**
       * The type of jsonl source. Always `file_content`.
       */
      type: 'file_content';
    }

    namespace FileContent {
      /**
       * One row of inline content: a required `item` record and an optional
       * `sample` record.
       */
      interface Content {
        item: Record<string, unknown>;
        sample?: Record<string, unknown>;
      }
    }

    /**
     * Source variant (`type: 'file_id'`) that refers to a file by its identifier.
     */
    interface FileID {
      /**
       * The identifier of the file.
       */
      id: string;

      /**
       * The type of jsonl source. Always `file_id`.
       */
      type: 'file_id';
    }
  }
  /**
   * An object representing an error response from the Eval API.
   */
  export interface EvalAPIError {
    /**
     * The error code.
     */
    code: string;

    /**
     * The error message.
     */
    message: string;
  }
  /**
   * A schema representing an evaluation run.
   */
  export interface RunCreateResponse {
    /**
     * Unique identifier for the evaluation run.
     */
    id: string;

    /**
     * Unix timestamp (in seconds) when the evaluation run was created.
     */
    created_at: number;

    /**
     * Information about the run's data source.
     */
    data_source:
      | CreateEvalJSONLRunDataSource
      | CreateEvalCompletionsRunDataSource
      | RunCreateResponse.Responses;

    /**
     * An object representing an error response from the Eval API.
     */
    error: EvalAPIError;

    /**
     * The identifier of the associated evaluation.
     */
    eval_id: string;

    /**
     * Set of 16 key-value pairs that can be attached to an object. This can be
     * useful for storing additional information about the object in a structured
     * format, and querying for objects via API or the dashboard.
     *
     * Keys are strings with a maximum length of 64 characters. Values are strings
     * with a maximum length of 512 characters.
     */
    metadata: Shared.Metadata | null;

    /**
     * The model that is evaluated, if applicable.
     */
    model: string;

    /**
     * The name of the evaluation run.
     */
    name: string;

    /**
     * The type of the object. Always "eval.run".
     */
    object: 'eval.run';

    /**
     * Usage statistics for each model during the evaluation run.
     */
    per_model_usage: Array<RunCreateResponse.PerModelUsage>;

    /**
     * Results per testing criteria applied during the evaluation run.
     */
    per_testing_criteria_results: Array<RunCreateResponse.PerTestingCriteriaResult>;

    /**
     * The URL to the rendered evaluation run report on the UI dashboard.
     */
    report_url: string;

    /**
     * Counters summarizing the outcomes of the evaluation run.
     */
    result_counts: RunCreateResponse.ResultCounts;

    /**
     * The status of the evaluation run.
     */
    status: string;
  }

  export declare namespace RunCreateResponse {
    /**
     * A ResponsesRunDataSource object describing a model sampling configuration.
     */
    interface Responses {
      /**
       * Determines what populates the `item` namespace in this run's data source.
       */
      source: Responses.FileContent | Responses.FileID | Responses.Responses;

      /**
       * The type of run data source. Always `responses`.
       */
      type: 'responses';

      /**
       * Used when sampling from a model. Dictates the structure of the messages
       * passed into the model. Can either be a reference to a prebuilt trajectory
       * (i.e., `item.input_trajectory`), or a template with variable references to
       * the `item` namespace.
       */
      input_messages?: Responses.Template | Responses.ItemReference;

      /**
       * The name of the model to use for generating completions (e.g. "o3-mini").
       */
      model?: string;

      /**
       * Optional sampling settings; see `SamplingParams` for the individual knobs.
       */
      sampling_params?: Responses.SamplingParams;
    }

    namespace Responses {
      /**
       * Source variant (`type: 'file_content'`) that carries the JSONL rows inline.
       */
      interface FileContent {
        /**
         * The content of the jsonl file.
         */
        content: Array<FileContent.Content>;

        /**
         * The type of jsonl source. Always `file_content`.
         */
        type: 'file_content';
      }

      namespace FileContent {
        /**
         * One row of inline content: a required `item` record and an optional
         * `sample` record.
         */
        interface Content {
          item: Record<string, unknown>;
          sample?: Record<string, unknown>;
        }
      }

      /**
       * Source variant (`type: 'file_id'`) that refers to a file by its identifier.
       */
      interface FileID {
        /**
         * The identifier of the file.
         */
        id: string;

        /**
         * The type of jsonl source. Always `file_id`.
         */
        type: 'file_id';
      }

      /**
       * An EvalResponsesSource object describing a run data source configuration.
       */
      interface Responses {
        /**
         * The type of run data source. Always `responses`.
         */
        type: 'responses';

        /**
         * Only include items created after this timestamp (inclusive). This is a
         * query parameter used to select responses.
         */
        created_after?: number | null;

        /**
         * Only include items created before this timestamp (inclusive). This is a
         * query parameter used to select responses.
         */
        created_before?: number | null;

        /**
         * Optional string to search the 'instructions' field. This is a query
         * parameter used to select responses.
         */
        instructions_search?: string | null;

        /**
         * Metadata filter for the responses. This is a query parameter used to
         * select responses.
         */
        metadata?: unknown | null;

        /**
         * The name of the model to find responses for. This is a query parameter
         * used to select responses.
         */
        model?: string | null;

        /**
         * Optional reasoning effort parameter. This is a query parameter used to
         * select responses.
         */
        reasoning_effort?: Shared.ReasoningEffort | null;

        /**
         * Sampling temperature. This is a query parameter used to select responses.
         */
        temperature?: number | null;

        /**
         * List of tool names. This is a query parameter used to select responses.
         */
        tools?: Array<string> | null;

        /**
         * Nucleus sampling parameter. This is a query parameter used to select
         * responses.
         */
        top_p?: number | null;

        /**
         * List of user identifiers. This is a query parameter used to select
         * responses.
         */
        users?: Array<string> | null;
      }

      /**
       * Input-messages variant (`type: 'template'`) holding a list of chat messages.
       */
      interface Template {
        /**
         * A list of chat messages forming the prompt or context. May include
         * variable references to the `item` namespace, i.e. {{item.name}}.
         */
        template: Array<Template.ChatMessage | Template.EvalItem>;

        /**
         * The type of input messages. Always `template`.
         */
        type: 'template';
      }

      namespace Template {
        /**
         * A plain chat message whose `role` and `content` are both free-form strings.
         */
        interface ChatMessage {
          /**
           * The content of the message.
           */
          content: string;

          /**
           * The role of the message (e.g. "system", "assistant", "user").
           */
          role: string;
        }

        /**
         * A message input to the model with a role indicating instruction following
         * hierarchy. Instructions given with the `developer` or `system` role take
         * precedence over instructions given with the `user` role. Messages with the
         * `assistant` role are presumed to have been generated by the model in
         * previous interactions.
         */
        interface EvalItem {
          /**
           * Text inputs to the model - can contain template strings.
           */
          content: string | ResponsesAPI.ResponseInputText | EvalItem.OutputText;

          /**
           * The role of the message input. One of `user`, `assistant`, `system`, or
           * `developer`.
           */
          role: 'user' | 'assistant' | 'system' | 'developer';

          /**
           * The type of the message input. Always `message`.
           */
          type?: 'message';
        }

        namespace EvalItem {
          /**
           * A text output from the model.
           */
          interface OutputText {
            /**
             * The text output from the model.
             */
            text: string;

            /**
             * The type of the output text. Always `output_text`.
             */
            type: 'output_text';
          }
        }
      }

      /**
       * Input-messages variant (`type: 'item_reference'`) that points at a variable
       * in the `item` namespace instead of spelling the messages out.
       */
      interface ItemReference {
        /**
         * A reference to a variable in the `item` namespace, e.g. "item.name".
         */
        item_reference: string;

        /**
         * The type of input messages. Always `item_reference`.
         */
        type: 'item_reference';
      }

      /**
       * Optional sampling settings; every field may be omitted.
       */
      interface SamplingParams {
        /**
         * The maximum number of tokens in the generated output.
         */
        max_completion_tokens?: number;

        /**
         * A seed value to initialize the randomness during sampling.
         */
        seed?: number;

        /**
         * A higher temperature increases randomness in the outputs.
         */
        temperature?: number;

        /**
         * An alternative to temperature for nucleus sampling; 1.0 includes all
         * tokens.
         */
        top_p?: number;
      }
    }

    /**
     * Token and invocation counters for a single model.
     */
    interface PerModelUsage {
      /**
       * The number of tokens retrieved from cache.
       */
      cached_tokens: number;

      /**
       * The number of completion tokens generated.
       */
      completion_tokens: number;

      /**
       * The number of invocations.
       */
      invocation_count: number;

      /**
       * The name of the model.
       */
      model_name: string;

      /**
       * The number of prompt tokens used.
       */
      prompt_tokens: number;

      /**
       * The total number of tokens used.
       */
      total_tokens: number;
    }

    /**
     * Pass/fail tally for one testing criterion.
     */
    interface PerTestingCriteriaResult {
      /**
       * Number of tests failed for this criteria.
       */
      failed: number;

      /**
       * Number of tests passed for this criteria.
       */
      passed: number;

      /**
       * A description of the testing criteria.
       */
      testing_criteria: string;
    }

    /**
     * Counters summarizing the outcomes of the evaluation run.
     */
    interface ResultCounts {
      /**
       * Number of output items that resulted in an error.
       */
      errored: number;

      /**
       * Number of output items that failed to pass the evaluation.
       */
      failed: number;

      /**
       * Number of output items that passed the evaluation.
       */
      passed: number;

      /**
       * Total number of executed output items.
       */
      total: number;
    }
  }
  /**
   * A schema representing an evaluation run.
   */
  export interface RunRetrieveResponse {
    /**
     * Unique identifier for the evaluation run.
     */
    id: string;

    /**
     * Unix timestamp (in seconds) when the evaluation run was created.
     */
    created_at: number;

    /**
     * Information about the run's data source.
     */
    data_source:
      | CreateEvalJSONLRunDataSource
      | CreateEvalCompletionsRunDataSource
      | RunRetrieveResponse.Responses;

    /**
     * An object representing an error response from the Eval API.
     */
    error: EvalAPIError;

    /**
     * The identifier of the associated evaluation.
     */
    eval_id: string;

    /**
     * Set of 16 key-value pairs that can be attached to an object. This can be
     * useful for storing additional information about the object in a structured
     * format, and querying for objects via API or the dashboard.
     *
     * Keys are strings with a maximum length of 64 characters. Values are strings
     * with a maximum length of 512 characters.
     */
    metadata: Shared.Metadata | null;

    /**
     * The model that is evaluated, if applicable.
     */
    model: string;

    /**
     * The name of the evaluation run.
     */
    name: string;

    /**
     * The type of the object. Always "eval.run".
     */
    object: 'eval.run';

    /**
     * Usage statistics for each model during the evaluation run.
     */
    per_model_usage: Array<RunRetrieveResponse.PerModelUsage>;

    /**
     * Results per testing criteria applied during the evaluation run.
     */
    per_testing_criteria_results: Array<RunRetrieveResponse.PerTestingCriteriaResult>;

    /**
     * The URL to the rendered evaluation run report on the UI dashboard.
     */
    report_url: string;

    /**
     * Counters summarizing the outcomes of the evaluation run.
     */
    result_counts: RunRetrieveResponse.ResultCounts;

    /**
     * The status of the evaluation run.
     */
    status: string;
  }
  647. export declare namespace RunRetrieveResponse {
  648. /**
  649. * A ResponsesRunDataSource object describing a model sampling configuration.
  650. */
  651. interface Responses {
  652. /**
  653. * Determines what populates the `item` namespace in this run's data source.
  654. */
  655. source: Responses.FileContent | Responses.FileID | Responses.Responses;
  656. /**
  657. * The type of run data source. Always `responses`.
  658. */
  659. type: 'responses';
  660. /**
  661. * Used when sampling from a model. Dictates the structure of the messages passed
  662. * into the model. Can either be a reference to a prebuilt trajectory (ie,
  663. * `item.input_trajectory`), or a template with variable references to the `item`
  664. * namespace.
  665. */
  666. input_messages?: Responses.Template | Responses.ItemReference;
  667. /**
  668. * The name of the model to use for generating completions (e.g. "o3-mini").
  669. */
  670. model?: string;
  671. sampling_params?: Responses.SamplingParams;
  672. }
  673. namespace Responses {
  674. interface FileContent {
  675. /**
  676. * The content of the jsonl file.
  677. */
  678. content: Array<FileContent.Content>;
  679. /**
  680. * The type of jsonl source. Always `file_content`.
  681. */
  682. type: 'file_content';
  683. }
  684. namespace FileContent {
  685. interface Content {
  686. item: Record<string, unknown>;
  687. sample?: Record<string, unknown>;
  688. }
  689. }
  690. interface FileID {
  691. /**
  692. * The identifier of the file.
  693. */
  694. id: string;
  695. /**
  696. * The type of jsonl source. Always `file_id`.
  697. */
  698. type: 'file_id';
  699. }
  700. /**
  701. * A EvalResponsesSource object describing a run data source configuration.
  702. */
  703. interface Responses {
  704. /**
  705. * The type of run data source. Always `responses`.
  706. */
  707. type: 'responses';
  708. /**
  709. * Only include items created after this timestamp (inclusive). This is a query
  710. * parameter used to select responses.
  711. */
  712. created_after?: number | null;
  713. /**
  714. * Only include items created before this timestamp (inclusive). This is a query
  715. * parameter used to select responses.
  716. */
  717. created_before?: number | null;
  718. /**
  719. * Optional string to search the 'instructions' field. This is a query parameter
  720. * used to select responses.
  721. */
  722. instructions_search?: string | null;
  723. /**
  724. * Metadata filter for the responses. This is a query parameter used to select
  725. * responses.
  726. */
  727. metadata?: unknown | null;
  728. /**
  729. * The name of the model to find responses for. This is a query parameter used to
  730. * select responses.
  731. */
  732. model?: string | null;
  733. /**
  734. * Optional reasoning effort parameter. This is a query parameter used to select
  735. * responses.
  736. */
  737. reasoning_effort?: Shared.ReasoningEffort | null;
  738. /**
  739. * Sampling temperature. This is a query parameter used to select responses.
  740. */
  741. temperature?: number | null;
  742. /**
  743. * List of tool names. This is a query parameter used to select responses.
  744. */
  745. tools?: Array<string> | null;
  746. /**
  747. * Nucleus sampling parameter. This is a query parameter used to select responses.
  748. */
  749. top_p?: number | null;
  750. /**
  751. * List of user identifiers. This is a query parameter used to select responses.
  752. */
  753. users?: Array<string> | null;
  754. }
  755. interface Template {
  756. /**
  757. * A list of chat messages forming the prompt or context. May include variable
  758. * references to the `item` namespace, ie {{item.name}}.
  759. */
  760. template: Array<Template.ChatMessage | Template.EvalItem>;
  761. /**
  762. * The type of input messages. Always `template`.
  763. */
  764. type: 'template';
  765. }
  namespace Template {
    /**
     * A plain chat message: free-form role plus string content.
     */
    interface ChatMessage {
      /**
       * The content of the message.
       */
      content: string;
      /**
       * The role of the message (e.g. "system", "assistant", "user").
       */
      role: string;
    }
    /**
     * A message input to the model with a role indicating instruction following
     * hierarchy. Instructions given with the `developer` or `system` role take
     * precedence over instructions given with the `user` role. Messages with the
     * `assistant` role are presumed to have been generated by the model in previous
     * interactions.
     */
    interface EvalItem {
      /**
       * Text inputs to the model - can contain template strings.
       */
      content: string | ResponsesAPI.ResponseInputText | EvalItem.OutputText;
      /**
       * The role of the message input. One of `user`, `assistant`, `system`, or
       * `developer`.
       */
      role: 'user' | 'assistant' | 'system' | 'developer';
      /**
       * The type of the message input. Always `message`.
       */
      type?: 'message';
    }
    namespace EvalItem {
      /**
       * A text output from the model.
       */
      interface OutputText {
        /**
         * The text output from the model.
         */
        text: string;
        /**
         * The type of the output text. Always `output_text`.
         */
        type: 'output_text';
      }
    }
  }
  /**
   * Input messages supplied by reference to a variable in the `item` namespace.
   */
  interface ItemReference {
    /**
     * A reference to a variable in the `item` namespace, e.g. "item.name".
     */
    item_reference: string;
    /**
     * The type of input messages. Always `item_reference`.
     */
    type: 'item_reference';
  }
  /**
   * Optional sampling settings. Every field may be omitted.
   */
  interface SamplingParams {
    /**
     * The maximum number of tokens in the generated output.
     */
    max_completion_tokens?: number;
    /**
     * A seed value to initialize the randomness during sampling.
     */
    seed?: number;
    /**
     * A higher temperature increases randomness in the outputs.
     */
    temperature?: number;
    /**
     * An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
     */
    top_p?: number;
  }
  843. }
  /**
   * Token and invocation counters for a single model.
   */
  interface PerModelUsage {
    /**
     * The number of tokens retrieved from cache.
     */
    cached_tokens: number;
    /**
     * The number of completion tokens generated.
     */
    completion_tokens: number;
    /**
     * The number of invocations.
     */
    invocation_count: number;
    /**
     * The name of the model.
     */
    model_name: string;
    /**
     * The number of prompt tokens used.
     */
    prompt_tokens: number;
    /**
     * The total number of tokens used.
     */
    total_tokens: number;
  }
  /**
   * Pass/fail counters for a single testing criteria.
   */
  interface PerTestingCriteriaResult {
    /**
     * Number of tests failed for this criteria.
     */
    failed: number;
    /**
     * Number of tests passed for this criteria.
     */
    passed: number;
    /**
     * A description of the testing criteria.
     */
    testing_criteria: string;
  }
  /**
   * Counters summarizing the outcomes of the evaluation run.
   */
  interface ResultCounts {
    /**
     * Number of output items that resulted in an error.
     */
    errored: number;
    /**
     * Number of output items that failed to pass the evaluation.
     */
    failed: number;
    /**
     * Number of output items that passed the evaluation.
     */
    passed: number;
    /**
     * Total number of executed output items.
     */
    total: number;
  }
  905. }
  /**
   * A schema representing an evaluation run.
   */
  export interface RunListResponse {
    /**
     * Unique identifier for the evaluation run.
     */
    id: string;
    /**
     * Unix timestamp (in seconds) when the evaluation run was created.
     */
    created_at: number;
    /**
     * Information about the run's data source.
     */
    data_source:
      | CreateEvalJSONLRunDataSource
      | CreateEvalCompletionsRunDataSource
      | RunListResponse.Responses;
    /**
     * An object representing an error response from the Eval API.
     */
    error: EvalAPIError;
    /**
     * The identifier of the associated evaluation.
     */
    eval_id: string;
    /**
     * Set of 16 key-value pairs that can be attached to an object. This can be useful
     * for storing additional information about the object in a structured format, and
     * querying for objects via API or the dashboard.
     *
     * Keys are strings with a maximum length of 64 characters. Values are strings with
     * a maximum length of 512 characters.
     */
    metadata: Shared.Metadata | null;
    /**
     * The model that is evaluated, if applicable.
     */
    model: string;
    /**
     * The name of the evaluation run.
     */
    name: string;
    /**
     * The type of the object. Always "eval.run".
     */
    object: 'eval.run';
    /**
     * Usage statistics for each model during the evaluation run.
     */
    per_model_usage: Array<RunListResponse.PerModelUsage>;
    /**
     * Results per testing criteria applied during the evaluation run.
     */
    per_testing_criteria_results: Array<RunListResponse.PerTestingCriteriaResult>;
    /**
     * The URL to the rendered evaluation run report on the UI dashboard.
     */
    report_url: string;
    /**
     * Counters summarizing the outcomes of the evaluation run.
     */
    result_counts: RunListResponse.ResultCounts;
    /**
     * The status of the evaluation run.
     */
    status: string;
  }
  /**
   * Nested types referenced by the `RunListResponse` interface.
   */
  export declare namespace RunListResponse {
    /**
     * A ResponsesRunDataSource object describing a model sampling configuration.
     */
    interface Responses {
      /**
       * Determines what populates the `item` namespace in this run's data source.
       */
      source: Responses.FileContent | Responses.FileID | Responses.Responses;
      /**
       * The type of run data source. Always `responses`.
       */
      type: 'responses';
      /**
       * Used when sampling from a model. Dictates the structure of the messages passed
       * into the model. Can either be a reference to a prebuilt trajectory (i.e.
       * `item.input_trajectory`), or a template with variable references to the `item`
       * namespace.
       */
      input_messages?: Responses.Template | Responses.ItemReference;
      /**
       * The name of the model to use for generating completions (e.g. "o3-mini").
       */
      model?: string;
      /**
       * Optional sampling settings; see `Responses.SamplingParams` for the fields.
       */
      sampling_params?: Responses.SamplingParams;
    }
    namespace Responses {
      /**
       * A jsonl source whose rows are provided inline.
       */
      interface FileContent {
        /**
         * The content of the jsonl file.
         */
        content: Array<FileContent.Content>;
        /**
         * The type of jsonl source. Always `file_content`.
         */
        type: 'file_content';
      }
      namespace FileContent {
        /**
         * One row of inline jsonl content.
         */
        interface Content {
          /**
           * Arbitrary string-keyed record for this row.
           */
          item: Record<string, unknown>;
          /**
           * Optional arbitrary string-keyed record accompanying `item`.
           * NOTE(review): exact semantics are not visible here; confirm against
           * the API reference.
           */
          sample?: Record<string, unknown>;
        }
      }
      /**
       * A jsonl source identified by a file ID.
       */
      interface FileID {
        /**
         * The identifier of the file.
         */
        id: string;
        /**
         * The type of jsonl source. Always `file_id`.
         */
        type: 'file_id';
      }
      /**
       * An EvalResponsesSource object describing a run data source configuration.
       */
      interface Responses {
        /**
         * The type of run data source. Always `responses`.
         */
        type: 'responses';
        /**
         * Only include items created after this timestamp (inclusive). This is a query
         * parameter used to select responses.
         */
        created_after?: number | null;
        /**
         * Only include items created before this timestamp (inclusive). This is a query
         * parameter used to select responses.
         */
        created_before?: number | null;
        /**
         * Optional string to search the 'instructions' field. This is a query parameter
         * used to select responses.
         */
        instructions_search?: string | null;
        /**
         * Metadata filter for the responses. This is a query parameter used to select
         * responses.
         */
        metadata?: unknown | null;
        /**
         * The name of the model to find responses for. This is a query parameter used to
         * select responses.
         */
        model?: string | null;
        /**
         * Optional reasoning effort parameter. This is a query parameter used to select
         * responses.
         */
        reasoning_effort?: Shared.ReasoningEffort | null;
        /**
         * Sampling temperature. This is a query parameter used to select responses.
         */
        temperature?: number | null;
        /**
         * List of tool names. This is a query parameter used to select responses.
         */
        tools?: Array<string> | null;
        /**
         * Nucleus sampling parameter. This is a query parameter used to select responses.
         */
        top_p?: number | null;
        /**
         * List of user identifiers. This is a query parameter used to select responses.
         */
        users?: Array<string> | null;
      }
      /**
       * Input messages expressed as a template.
       */
      interface Template {
        /**
         * A list of chat messages forming the prompt or context. May include variable
         * references to the `item` namespace, e.g. `{{item.name}}`.
         */
        template: Array<Template.ChatMessage | Template.EvalItem>;
        /**
         * The type of input messages. Always `template`.
         */
        type: 'template';
      }
      namespace Template {
        /**
         * A plain chat message: free-form role plus string content.
         */
        interface ChatMessage {
          /**
           * The content of the message.
           */
          content: string;
          /**
           * The role of the message (e.g. "system", "assistant", "user").
           */
          role: string;
        }
        /**
         * A message input to the model with a role indicating instruction following
         * hierarchy. Instructions given with the `developer` or `system` role take
         * precedence over instructions given with the `user` role. Messages with the
         * `assistant` role are presumed to have been generated by the model in previous
         * interactions.
         */
        interface EvalItem {
          /**
           * Text inputs to the model - can contain template strings.
           */
          content: string | ResponsesAPI.ResponseInputText | EvalItem.OutputText;
          /**
           * The role of the message input. One of `user`, `assistant`, `system`, or
           * `developer`.
           */
          role: 'user' | 'assistant' | 'system' | 'developer';
          /**
           * The type of the message input. Always `message`.
           */
          type?: 'message';
        }
        namespace EvalItem {
          /**
           * A text output from the model.
           */
          interface OutputText {
            /**
             * The text output from the model.
             */
            text: string;
            /**
             * The type of the output text. Always `output_text`.
             */
            type: 'output_text';
          }
        }
      }
      /**
       * Input messages supplied by reference to a variable in the `item` namespace.
       */
      interface ItemReference {
        /**
         * A reference to a variable in the `item` namespace, e.g. "item.name".
         */
        item_reference: string;
        /**
         * The type of input messages. Always `item_reference`.
         */
        type: 'item_reference';
      }
      /**
       * Optional sampling settings. Every field may be omitted.
       */
      interface SamplingParams {
        /**
         * The maximum number of tokens in the generated output.
         */
        max_completion_tokens?: number;
        /**
         * A seed value to initialize the randomness during sampling.
         */
        seed?: number;
        /**
         * A higher temperature increases randomness in the outputs.
         */
        temperature?: number;
        /**
         * An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
         */
        top_p?: number;
      }
    }
    /**
     * Token and invocation counters for a single model.
     */
    interface PerModelUsage {
      /**
       * The number of tokens retrieved from cache.
       */
      cached_tokens: number;
      /**
       * The number of completion tokens generated.
       */
      completion_tokens: number;
      /**
       * The number of invocations.
       */
      invocation_count: number;
      /**
       * The name of the model.
       */
      model_name: string;
      /**
       * The number of prompt tokens used.
       */
      prompt_tokens: number;
      /**
       * The total number of tokens used.
       */
      total_tokens: number;
    }
    /**
     * Pass/fail counters for a single testing criteria.
     */
    interface PerTestingCriteriaResult {
      /**
       * Number of tests failed for this criteria.
       */
      failed: number;
      /**
       * Number of tests passed for this criteria.
       */
      passed: number;
      /**
       * A description of the testing criteria.
       */
      testing_criteria: string;
    }
    /**
     * Counters summarizing the outcomes of the evaluation run.
     */
    interface ResultCounts {
      /**
       * Number of output items that resulted in an error.
       */
      errored: number;
      /**
       * Number of output items that failed to pass the evaluation.
       */
      failed: number;
      /**
       * Number of output items that passed the evaluation.
       */
      passed: number;
      /**
       * Total number of executed output items.
       */
      total: number;
    }
  }
  /**
   * Response shape for deleting an evaluation run (per the type name). Every
   * field is optional.
   */
  export interface RunDeleteResponse {
    /**
     * Deletion flag. NOTE(review): presumably `true` when the run was removed;
     * confirm against the API reference.
     */
    deleted?: boolean;
    /**
     * Object type tag. NOTE(review): the expected literal value is not visible
     * here; confirm against the API reference.
     */
    object?: string;
    /**
     * Identifier of the run this response refers to (per the field name).
     */
    run_id?: string;
  }
  /**
   * A schema representing an evaluation run.
   */
  export interface RunCancelResponse {
    /**
     * Unique identifier for the evaluation run.
     */
    id: string;
    /**
     * Unix timestamp (in seconds) when the evaluation run was created.
     */
    created_at: number;
    /**
     * Information about the run's data source.
     */
    data_source:
      | CreateEvalJSONLRunDataSource
      | CreateEvalCompletionsRunDataSource
      | RunCancelResponse.Responses;
    /**
     * An object representing an error response from the Eval API.
     */
    error: EvalAPIError;
    /**
     * The identifier of the associated evaluation.
     */
    eval_id: string;
    /**
     * Set of 16 key-value pairs that can be attached to an object. This can be useful
     * for storing additional information about the object in a structured format, and
     * querying for objects via API or the dashboard.
     *
     * Keys are strings with a maximum length of 64 characters. Values are strings with
     * a maximum length of 512 characters.
     */
    metadata: Shared.Metadata | null;
    /**
     * The model that is evaluated, if applicable.
     */
    model: string;
    /**
     * The name of the evaluation run.
     */
    name: string;
    /**
     * The type of the object. Always "eval.run".
     */
    object: 'eval.run';
    /**
     * Usage statistics for each model during the evaluation run.
     */
    per_model_usage: Array<RunCancelResponse.PerModelUsage>;
    /**
     * Results per testing criteria applied during the evaluation run.
     */
    per_testing_criteria_results: Array<RunCancelResponse.PerTestingCriteriaResult>;
    /**
     * The URL to the rendered evaluation run report on the UI dashboard.
     */
    report_url: string;
    /**
     * Counters summarizing the outcomes of the evaluation run.
     */
    result_counts: RunCancelResponse.ResultCounts;
    /**
     * The status of the evaluation run.
     */
    status: string;
  }
  /**
   * Nested types referenced by the `RunCancelResponse` interface.
   */
  export declare namespace RunCancelResponse {
    /**
     * A ResponsesRunDataSource object describing a model sampling configuration.
     */
    interface Responses {
      /**
       * Determines what populates the `item` namespace in this run's data source.
       */
      source: Responses.FileContent | Responses.FileID | Responses.Responses;
      /**
       * The type of run data source. Always `responses`.
       */
      type: 'responses';
      /**
       * Used when sampling from a model. Dictates the structure of the messages passed
       * into the model. Can either be a reference to a prebuilt trajectory (i.e.
       * `item.input_trajectory`), or a template with variable references to the `item`
       * namespace.
       */
      input_messages?: Responses.Template | Responses.ItemReference;
      /**
       * The name of the model to use for generating completions (e.g. "o3-mini").
       */
      model?: string;
      /**
       * Optional sampling settings; see `Responses.SamplingParams` for the fields.
       */
      sampling_params?: Responses.SamplingParams;
    }
    namespace Responses {
      /**
       * A jsonl source whose rows are provided inline.
       */
      interface FileContent {
        /**
         * The content of the jsonl file.
         */
        content: Array<FileContent.Content>;
        /**
         * The type of jsonl source. Always `file_content`.
         */
        type: 'file_content';
      }
      namespace FileContent {
        /**
         * One row of inline jsonl content.
         */
        interface Content {
          /**
           * Arbitrary string-keyed record for this row.
           */
          item: Record<string, unknown>;
          /**
           * Optional arbitrary string-keyed record accompanying `item`.
           * NOTE(review): exact semantics are not visible here; confirm against
           * the API reference.
           */
          sample?: Record<string, unknown>;
        }
      }
      /**
       * A jsonl source identified by a file ID.
       */
      interface FileID {
        /**
         * The identifier of the file.
         */
        id: string;
        /**
         * The type of jsonl source. Always `file_id`.
         */
        type: 'file_id';
      }
      /**
       * An EvalResponsesSource object describing a run data source configuration.
       */
      interface Responses {
        /**
         * The type of run data source. Always `responses`.
         */
        type: 'responses';
        /**
         * Only include items created after this timestamp (inclusive). This is a query
         * parameter used to select responses.
         */
        created_after?: number | null;
        /**
         * Only include items created before this timestamp (inclusive). This is a query
         * parameter used to select responses.
         */
        created_before?: number | null;
        /**
         * Optional string to search the 'instructions' field. This is a query parameter
         * used to select responses.
         */
        instructions_search?: string | null;
        /**
         * Metadata filter for the responses. This is a query parameter used to select
         * responses.
         */
        metadata?: unknown | null;
        /**
         * The name of the model to find responses for. This is a query parameter used to
         * select responses.
         */
        model?: string | null;
        /**
         * Optional reasoning effort parameter. This is a query parameter used to select
         * responses.
         */
        reasoning_effort?: Shared.ReasoningEffort | null;
        /**
         * Sampling temperature. This is a query parameter used to select responses.
         */
        temperature?: number | null;
        /**
         * List of tool names. This is a query parameter used to select responses.
         */
        tools?: Array<string> | null;
        /**
         * Nucleus sampling parameter. This is a query parameter used to select responses.
         */
        top_p?: number | null;
        /**
         * List of user identifiers. This is a query parameter used to select responses.
         */
        users?: Array<string> | null;
      }
      /**
       * Input messages expressed as a template.
       */
      interface Template {
        /**
         * A list of chat messages forming the prompt or context. May include variable
         * references to the `item` namespace, e.g. `{{item.name}}`.
         */
        template: Array<Template.ChatMessage | Template.EvalItem>;
        /**
         * The type of input messages. Always `template`.
         */
        type: 'template';
      }
      namespace Template {
        /**
         * A plain chat message: free-form role plus string content.
         */
        interface ChatMessage {
          /**
           * The content of the message.
           */
          content: string;
          /**
           * The role of the message (e.g. "system", "assistant", "user").
           */
          role: string;
        }
        /**
         * A message input to the model with a role indicating instruction following
         * hierarchy. Instructions given with the `developer` or `system` role take
         * precedence over instructions given with the `user` role. Messages with the
         * `assistant` role are presumed to have been generated by the model in previous
         * interactions.
         */
        interface EvalItem {
          /**
           * Text inputs to the model - can contain template strings.
           */
          content: string | ResponsesAPI.ResponseInputText | EvalItem.OutputText;
          /**
           * The role of the message input. One of `user`, `assistant`, `system`, or
           * `developer`.
           */
          role: 'user' | 'assistant' | 'system' | 'developer';
          /**
           * The type of the message input. Always `message`.
           */
          type?: 'message';
        }
        namespace EvalItem {
          /**
           * A text output from the model.
           */
          interface OutputText {
            /**
             * The text output from the model.
             */
            text: string;
            /**
             * The type of the output text. Always `output_text`.
             */
            type: 'output_text';
          }
        }
      }
      /**
       * Input messages supplied by reference to a variable in the `item` namespace.
       */
      interface ItemReference {
        /**
         * A reference to a variable in the `item` namespace, e.g. "item.name".
         */
        item_reference: string;
        /**
         * The type of input messages. Always `item_reference`.
         */
        type: 'item_reference';
      }
      /**
       * Optional sampling settings. Every field may be omitted.
       */
      interface SamplingParams {
        /**
         * The maximum number of tokens in the generated output.
         */
        max_completion_tokens?: number;
        /**
         * A seed value to initialize the randomness during sampling.
         */
        seed?: number;
        /**
         * A higher temperature increases randomness in the outputs.
         */
        temperature?: number;
        /**
         * An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
         */
        top_p?: number;
      }
    }
    /**
     * Token and invocation counters for a single model.
     */
    interface PerModelUsage {
      /**
       * The number of tokens retrieved from cache.
       */
      cached_tokens: number;
      /**
       * The number of completion tokens generated.
       */
      completion_tokens: number;
      /**
       * The number of invocations.
       */
      invocation_count: number;
      /**
       * The name of the model.
       */
      model_name: string;
      /**
       * The number of prompt tokens used.
       */
      prompt_tokens: number;
      /**
       * The total number of tokens used.
       */
      total_tokens: number;
    }
    /**
     * Pass/fail counters for a single testing criteria.
     */
    interface PerTestingCriteriaResult {
      /**
       * Number of tests failed for this criteria.
       */
      failed: number;
      /**
       * Number of tests passed for this criteria.
       */
      passed: number;
      /**
       * A description of the testing criteria.
       */
      testing_criteria: string;
    }
    /**
     * Counters summarizing the outcomes of the evaluation run.
     */
    interface ResultCounts {
      /**
       * Number of output items that resulted in an error.
       */
      errored: number;
      /**
       * Number of output items that failed to pass the evaluation.
       */
      failed: number;
      /**
       * Number of output items that passed the evaluation.
       */
      passed: number;
      /**
       * Total number of executed output items.
       */
      total: number;
    }
  }
  /**
   * Parameters for creating an evaluation run: a required data source plus
   * optional metadata and name.
   */
  export interface RunCreateParams {
    /**
     * Details about the run's data source.
     */
    data_source:
      | CreateEvalJSONLRunDataSource
      | CreateEvalCompletionsRunDataSource
      | RunCreateParams.CreateEvalResponsesRunDataSource;
    /**
     * Set of 16 key-value pairs that can be attached to an object. This can be useful
     * for storing additional information about the object in a structured format, and
     * querying for objects via API or the dashboard.
     *
     * Keys are strings with a maximum length of 64 characters. Values are strings with
     * a maximum length of 512 characters.
     */
    metadata?: Shared.Metadata | null;
    /**
     * The name of the run.
     */
    name?: string;
  }
  /**
   * Nested types referenced by the `RunCreateParams` interface.
   */
  export declare namespace RunCreateParams {
    /**
     * A ResponsesRunDataSource object describing a model sampling configuration.
     */
    interface CreateEvalResponsesRunDataSource {
      /**
       * Determines what populates the `item` namespace in this run's data source.
       */
      source:
        | CreateEvalResponsesRunDataSource.FileContent
        | CreateEvalResponsesRunDataSource.FileID
        | CreateEvalResponsesRunDataSource.Responses;
      /**
       * The type of run data source. Always `responses`.
       */
      type: 'responses';
      /**
       * Used when sampling from a model. Dictates the structure of the messages passed
       * into the model. Can either be a reference to a prebuilt trajectory (i.e.
       * `item.input_trajectory`), or a template with variable references to the `item`
       * namespace.
       */
      input_messages?:
        | CreateEvalResponsesRunDataSource.Template
        | CreateEvalResponsesRunDataSource.ItemReference;
      /**
       * The name of the model to use for generating completions (e.g. "o3-mini").
       */
      model?: string;
      /**
       * Optional sampling settings; see
       * `CreateEvalResponsesRunDataSource.SamplingParams` for the fields.
       */
      sampling_params?: CreateEvalResponsesRunDataSource.SamplingParams;
    }
    namespace CreateEvalResponsesRunDataSource {
      /**
       * A jsonl source whose rows are provided inline.
       */
      interface FileContent {
        /**
         * The content of the jsonl file.
         */
        content: Array<FileContent.Content>;
        /**
         * The type of jsonl source. Always `file_content`.
         */
        type: 'file_content';
      }
      namespace FileContent {
        /**
         * One row of inline jsonl content.
         */
        interface Content {
          /**
           * Arbitrary string-keyed record for this row.
           */
          item: Record<string, unknown>;
          /**
           * Optional arbitrary string-keyed record accompanying `item`.
           * NOTE(review): exact semantics are not visible here; confirm against
           * the API reference.
           */
          sample?: Record<string, unknown>;
        }
      }
      /**
       * A jsonl source identified by a file ID.
       */
      interface FileID {
        /**
         * The identifier of the file.
         */
        id: string;
        /**
         * The type of jsonl source. Always `file_id`.
         */
        type: 'file_id';
      }
      /**
       * An EvalResponsesSource object describing a run data source configuration.
       */
      interface Responses {
        /**
         * The type of run data source. Always `responses`.
         */
        type: 'responses';
        /**
         * Only include items created after this timestamp (inclusive). This is a query
         * parameter used to select responses.
         */
        created_after?: number | null;
        /**
         * Only include items created before this timestamp (inclusive). This is a query
         * parameter used to select responses.
         */
        created_before?: number | null;
        /**
         * Optional string to search the 'instructions' field. This is a query parameter
         * used to select responses.
         */
        instructions_search?: string | null;
        /**
         * Metadata filter for the responses. This is a query parameter used to select
         * responses.
         */
        metadata?: unknown | null;
        /**
         * The name of the model to find responses for. This is a query parameter used to
         * select responses.
         */
        model?: string | null;
        /**
         * Optional reasoning effort parameter. This is a query parameter used to select
         * responses.
         */
        reasoning_effort?: Shared.ReasoningEffort | null;
        /**
         * Sampling temperature. This is a query parameter used to select responses.
         */
        temperature?: number | null;
        /**
         * List of tool names. This is a query parameter used to select responses.
         */
        tools?: Array<string> | null;
        /**
         * Nucleus sampling parameter. This is a query parameter used to select responses.
         */
        top_p?: number | null;
        /**
         * List of user identifiers. This is a query parameter used to select responses.
         */
        users?: Array<string> | null;
      }
      /**
       * Input messages expressed as a template.
       */
      interface Template {
        /**
         * A list of chat messages forming the prompt or context. May include variable
         * references to the `item` namespace, e.g. `{{item.name}}`.
         */
        template: Array<Template.ChatMessage | Template.EvalItem>;
        /**
         * The type of input messages. Always `template`.
         */
        type: 'template';
      }
      namespace Template {
        /**
         * A plain chat message: free-form role plus string content.
         */
        interface ChatMessage {
          /**
           * The content of the message.
           */
          content: string;
          /**
           * The role of the message (e.g. "system", "assistant", "user").
           */
          role: string;
        }
        /**
         * A message input to the model with a role indicating instruction following
         * hierarchy. Instructions given with the `developer` or `system` role take
         * precedence over instructions given with the `user` role. Messages with the
         * `assistant` role are presumed to have been generated by the model in previous
         * interactions.
         */
        interface EvalItem {
          /**
           * Text inputs to the model - can contain template strings.
           */
          content: string | ResponsesAPI.ResponseInputText | EvalItem.OutputText;
          /**
           * The role of the message input. One of `user`, `assistant`, `system`, or
           * `developer`.
           */
          role: 'user' | 'assistant' | 'system' | 'developer';
          /**
           * The type of the message input. Always `message`.
           */
          type?: 'message';
        }
        namespace EvalItem {
          /**
           * A text output from the model.
           */
          interface OutputText {
            /**
             * The text output from the model.
             */
            text: string;
            /**
             * The type of the output text. Always `output_text`.
             */
            type: 'output_text';
          }
        }
      }
      /**
       * Input messages supplied by reference to a variable in the `item` namespace.
       */
      interface ItemReference {
        /**
         * A reference to a variable in the `item` namespace, e.g. "item.name".
         */
        item_reference: string;
        /**
         * The type of input messages. Always `item_reference`.
         */
        type: 'item_reference';
      }
      /**
       * Optional sampling settings. Every field may be omitted.
       */
      interface SamplingParams {
        /**
         * The maximum number of tokens in the generated output.
         */
        max_completion_tokens?: number;
        /**
         * A seed value to initialize the randomness during sampling.
         */
        seed?: number;
        /**
         * A higher temperature increases randomness in the outputs.
         */
        temperature?: number;
        /**
         * An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
         */
        top_p?: number;
      }
    }
  }
  /**
   * Query parameters for listing runs; adds ordering and a status filter on top
   * of the inherited `CursorPageParams` fields.
   */
  export interface RunListParams extends CursorPageParams {
    /**
     * Sort order for runs by timestamp. Use `asc` for ascending order or `desc` for
     * descending order. Defaults to `asc`.
     */
    order?: 'asc' | 'desc';
    /**
     * Filter runs by status. One of `queued` | `in_progress` | `failed` | `completed`
     * | `canceled`.
     */
    status?: 'queued' | 'in_progress' | 'completed' | 'canceled' | 'failed';
  }
  /**
   * Re-exports the run-related declarations, and the output-item declarations,
   * as members of the `Runs` namespace. Names exported without `type` are
   * exported as values as well as types.
   */
  export declare namespace Runs {
    export {
      type CreateEvalCompletionsRunDataSource as CreateEvalCompletionsRunDataSource,
      type CreateEvalJSONLRunDataSource as CreateEvalJSONLRunDataSource,
      type EvalAPIError as EvalAPIError,
      type RunCreateResponse as RunCreateResponse,
      type RunRetrieveResponse as RunRetrieveResponse,
      type RunListResponse as RunListResponse,
      type RunDeleteResponse as RunDeleteResponse,
      type RunCancelResponse as RunCancelResponse,
      RunListResponsesPage as RunListResponsesPage,
      type RunCreateParams as RunCreateParams,
      type RunListParams as RunListParams,
    };
    export {
      OutputItems as OutputItems,
      type OutputItemRetrieveResponse as OutputItemRetrieveResponse,
      type OutputItemListResponse as OutputItemListResponse,
      OutputItemListResponsesPage as OutputItemListResponsesPage,
      type OutputItemListParams as OutputItemListParams,
    };
  }
  1794. //# sourceMappingURL=runs.d.ts.map