benchmark.py 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942
  1. # Copyright (c) Microsoft Corporation. All rights reserved.
  2. # Copyright 2018 The HuggingFace Inc. team.
  3. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. """Benchmarking the inference of pretrained transformer models.
  17. PyTorch/TorchScript benchmark is based on https://github.com/huggingface/transformers/blob/master/examples/benchmarks.py.
  18. One difference is that random input_ids are generated in this benchmark.
  19. For onnxruntime, this script will convert a pretrained model to ONNX, and optimize it when -o parameter is used.
  20. Example commands:
  21. Export all models to ONNX, optimize and validate them:
  22. python benchmark.py -b 0 -o -v -i 1 2 3
  23. Run OnnxRuntime on GPU for all models:
  24. python benchmark.py -g
  25. Run OnnxRuntime on GPU for all models with fp32 optimization:
  26. python benchmark.py -g -o
  27. Run OnnxRuntime on GPU with fp16 optimization:
  28. python benchmark.py -g -o -p "fp16"
  29. Run TorchScript on GPU for all models:
  30. python benchmark.py -e torchscript -g
  31. Run TorchScript on GPU for all models with fp16:
  32. python benchmark.py -e torchscript -g -p "fp16"
  33. Run ONNXRuntime and TorchScript on CPU for all models with quantization:
  34. python benchmark.py -e torchscript onnxruntime -p "int8" -o
  35. Run OnnxRuntime with bfloat16 fastmath mode kernels on aarch64 platforms with bfloat16 support:
  36. python benchmark.py --enable_arm64_bfloat16_fastmath_mlas_gemm
  37. It is recommended to use run_benchmark.sh to launch benchmark.
  38. """
  39. import argparse
  40. import logging
  41. import os
  42. import timeit
  43. from datetime import datetime
  44. import numpy
  45. import psutil
  46. from benchmark_helper import (
  47. ConfigModifier,
  48. OptimizerInfo,
  49. Precision,
  50. create_onnxruntime_session,
  51. get_latency_result,
  52. inference_ort,
  53. inference_ort_with_io_binding,
  54. output_details,
  55. output_fusion_statistics,
  56. output_summary,
  57. setup_logger,
  58. )
  59. from fusion_options import FusionOptions
  60. from huggingface_models import MODEL_CLASSES, MODELS
  61. from onnx_exporter import (
  62. create_onnxruntime_input,
  63. export_onnx_model_from_pt,
  64. export_onnx_model_from_tf,
  65. load_pretrained_model,
  66. )
  67. from packaging import version
  68. from quantize_helper import QuantizeHelper
  69. logger = logging.getLogger("")
  70. cpu_count = psutil.cpu_count(logical=False)
  71. # Set OMP environment variable before importing onnxruntime or torch.
  72. if "OMP_NUM_THREADS" not in os.environ:
  73. os.environ["OMP_NUM_THREADS"] = str(cpu_count)
  74. import torch # noqa: E402
  75. from transformers import AutoConfig, AutoTokenizer, LxmertConfig # noqa: E402
  76. def run_onnxruntime(
  77. use_gpu,
  78. provider,
  79. model_names,
  80. model_class,
  81. config_modifier,
  82. precision,
  83. num_threads,
  84. batch_sizes,
  85. sequence_lengths,
  86. repeat_times,
  87. input_counts,
  88. optimizer_info,
  89. validate_onnx,
  90. cache_dir,
  91. onnx_dir,
  92. verbose,
  93. overwrite,
  94. disable_ort_io_binding,
  95. use_raw_attention_mask,
  96. model_fusion_statistics,
  97. model_source,
  98. enable_arm64_bfloat16_fastmath_mlas_gemm,
  99. args,
  100. ):
  101. import onnxruntime # noqa: PLC0415
  102. results = []
  103. if (
  104. use_gpu
  105. and ("CUDAExecutionProvider" not in onnxruntime.get_available_providers())
  106. and ("MIGraphXExecutionProvider" not in onnxruntime.get_available_providers())
  107. and ("DmlExecutionProvider" not in onnxruntime.get_available_providers())
  108. ):
  109. logger.error(
  110. "Please install onnxruntime-gpu or onnxruntime-directml package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
  111. )
  112. return results
  113. warm_up_repeat = 0
  114. if provider == "tensorrt":
  115. optimizer_info = OptimizerInfo.NOOPT
  116. warm_up_repeat = 5
  117. if "TensorrtExecutionProvider" not in onnxruntime.get_available_providers():
  118. logger.error(
  119. "Please install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance."
  120. )
  121. return results
  122. if optimizer_info == OptimizerInfo.NOOPT:
  123. logger.warning(
  124. f"OptimizerInfo is set to {optimizer_info}, graph optimizations specified in FusionOptions are not applied."
  125. )
  126. for model_name in model_names:
  127. all_input_names = MODELS[model_name][0]
  128. for num_inputs in input_counts:
  129. if num_inputs > len(all_input_names):
  130. break
  131. input_names = all_input_names[:num_inputs]
  132. args.model_type = MODELS[model_name][3]
  133. fusion_options = FusionOptions.parse(args)
  134. if "pt" in model_source:
  135. with torch.no_grad():
  136. (
  137. onnx_model_file,
  138. is_valid_onnx_model,
  139. vocab_size,
  140. max_sequence_length,
  141. ) = export_onnx_model_from_pt(
  142. model_name,
  143. MODELS[model_name][1],
  144. MODELS[model_name][2],
  145. MODELS[model_name][3],
  146. model_class,
  147. config_modifier,
  148. cache_dir,
  149. onnx_dir,
  150. input_names,
  151. use_gpu,
  152. precision,
  153. optimizer_info,
  154. validate_onnx,
  155. use_raw_attention_mask,
  156. overwrite,
  157. model_fusion_statistics,
  158. fusion_options,
  159. )
  160. if "tf" in model_source:
  161. (
  162. onnx_model_file,
  163. is_valid_onnx_model,
  164. vocab_size,
  165. max_sequence_length,
  166. ) = export_onnx_model_from_tf(
  167. model_name,
  168. MODELS[model_name][1],
  169. MODELS[model_name][2],
  170. MODELS[model_name][3],
  171. model_class,
  172. config_modifier,
  173. cache_dir,
  174. onnx_dir,
  175. input_names,
  176. use_gpu,
  177. precision,
  178. optimizer_info,
  179. validate_onnx,
  180. use_raw_attention_mask,
  181. overwrite,
  182. model_fusion_statistics,
  183. fusion_options,
  184. )
  185. if not is_valid_onnx_model:
  186. continue
  187. ort_session = create_onnxruntime_session(
  188. onnx_model_file,
  189. use_gpu,
  190. provider,
  191. enable_all_optimization=True,
  192. num_threads=num_threads,
  193. verbose=verbose,
  194. enable_mlas_gemm_fastmath_arm64_bfloat16=enable_arm64_bfloat16_fastmath_mlas_gemm,
  195. )
  196. if ort_session is None:
  197. continue
  198. ort_output_names = [node_arg.name for node_arg in ort_session.get_outputs()]
  199. output_buffers = []
  200. device = "cuda" if use_gpu else "cpu"
  201. config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
  202. max_last_state_size = numpy.prod(
  203. [
  204. max(batch_sizes),
  205. max(sequence_lengths),
  206. max(vocab_size, config.hidden_size),
  207. ]
  208. )
  209. max_pooler_size = numpy.prod([max(batch_sizes), config.hidden_size])
  210. for batch_size in batch_sizes:
  211. if batch_size <= 0:
  212. continue
  213. for sequence_length in sequence_lengths:
  214. if max_sequence_length is not None and sequence_length > max_sequence_length:
  215. continue
  216. input_value_type = numpy.int64 if "pt" in model_source else numpy.int32
  217. ort_inputs = create_onnxruntime_input(
  218. vocab_size,
  219. batch_size,
  220. sequence_length,
  221. input_names,
  222. config,
  223. input_value_type,
  224. )
  225. result_template = {
  226. "engine": "onnxruntime",
  227. "version": onnxruntime.__version__,
  228. "providers": provider,
  229. "device": device,
  230. "optimizer": optimizer_info,
  231. "precision": precision,
  232. "io_binding": not disable_ort_io_binding,
  233. "model_name": model_name,
  234. "inputs": num_inputs,
  235. "threads": num_threads,
  236. "batch_size": batch_size,
  237. "sequence_length": sequence_length,
  238. "custom_layer_num": config_modifier.get_layer_num(),
  239. "datetime": str(datetime.now()),
  240. }
  241. if config.model_type in ["vit", "swin"]:
  242. logger.info(
  243. f"Run onnxruntime on {model_name} with input shape {[batch_size, 3, config.image_size, config.image_size]}"
  244. )
  245. else:
  246. logger.info(f"Run onnxruntime on {model_name} with input shape {[batch_size, sequence_length]}")
  247. if disable_ort_io_binding:
  248. result = inference_ort(
  249. ort_session,
  250. ort_inputs,
  251. result_template,
  252. repeat_times,
  253. batch_size,
  254. warm_up_repeat,
  255. )
  256. else:
  257. # Get output sizes from a dummy ort run
  258. ort_outputs = ort_session.run(ort_output_names, ort_inputs)
  259. output_buffer_max_sizes = [max_last_state_size]
  260. for i in range(len(ort_outputs)):
  261. if i == 2 and MODELS[model_name][3] == "gpt":
  262. # past state output max size
  263. output_buffer_max_sizes.append(max_pooler_size)
  264. else:
  265. output_buffer_max_sizes.append(max_last_state_size)
  266. data_type = numpy.longlong if "pt" in model_source else numpy.intc
  267. result = inference_ort_with_io_binding(
  268. ort_session,
  269. ort_inputs,
  270. result_template,
  271. repeat_times,
  272. ort_output_names,
  273. ort_outputs,
  274. output_buffers,
  275. output_buffer_max_sizes,
  276. batch_size,
  277. device,
  278. data_type,
  279. warm_up_repeat,
  280. )
  281. logger.info(result)
  282. results.append(result)
  283. return results
  284. def run_pytorch(
  285. use_gpu,
  286. model_names,
  287. model_class,
  288. config_modifier,
  289. precision,
  290. num_threads,
  291. batch_sizes,
  292. sequence_lengths,
  293. repeat_times,
  294. torchscript,
  295. torch2,
  296. cache_dir,
  297. verbose,
  298. ):
  299. results = []
  300. if use_gpu and not torch.cuda.is_available():
  301. logger.error("Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.")
  302. return results
  303. torch.set_grad_enabled(False)
  304. for model_name in model_names:
  305. config = AutoConfig.from_pretrained(model_name, torchscript=torchscript, cache_dir=cache_dir)
  306. config_modifier.modify(config)
  307. model = load_pretrained_model(
  308. model_name,
  309. config=config,
  310. cache_dir=cache_dir,
  311. custom_model_class=model_class,
  312. )
  313. if config.model_type in ["vit", "swin"]:
  314. # These models don't use sequence lengths, so just pick the first sequence length so that the summary still works
  315. sequence_lengths = [sequence_lengths[0]]
  316. else:
  317. tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
  318. max_input_size = tokenizer.model_max_length
  319. logger.debug(f"Model {model}")
  320. logger.debug(f"Number of parameters {model.num_parameters()}")
  321. if precision == Precision.FLOAT16:
  322. model.half()
  323. device = torch.device("cuda:0" if use_gpu else "cpu")
  324. model.to(device)
  325. if precision == Precision.INT8:
  326. model = QuantizeHelper.quantize_torch_model(model)
  327. for batch_size in batch_sizes:
  328. if batch_size <= 0:
  329. continue
  330. for sequence_length in sequence_lengths:
  331. if config.model_type in ["vit", "swin"]:
  332. logger.info(
  333. f"Run PyTorch on {model_name} with input shape {[batch_size, 3, config.image_size, config.image_size]}"
  334. )
  335. input_ids = torch.randn(
  336. size=(batch_size, 3, config.image_size, config.image_size),
  337. dtype=torch.float16 if precision == Precision.FLOAT16 else torch.float32,
  338. device=device,
  339. )
  340. else:
  341. if max_input_size is not None and sequence_length > max_input_size:
  342. continue
  343. logger.info(f"Run PyTorch on {model_name} with input shape {[batch_size, sequence_length]}")
  344. input_ids = torch.randint(
  345. low=0,
  346. high=config.vocab_size - 1,
  347. size=(batch_size, sequence_length),
  348. dtype=torch.long,
  349. device=device,
  350. )
  351. try:
  352. inference = (
  353. torch.jit.trace(model, input_ids) if torchscript else torch.compile(model) if torch2 else model
  354. )
  355. inference(input_ids)
  356. runtimes = timeit.repeat(lambda: inference(input_ids), repeat=repeat_times, number=1) # noqa: B023
  357. result = {
  358. "engine": "torchscript" if torchscript else "torch2" if torch2 else "torch",
  359. "version": torch.__version__,
  360. "providers": "NA",
  361. "device": "cuda" if use_gpu else "cpu",
  362. "optimizer": "",
  363. "precision": precision,
  364. "io_binding": "",
  365. "model_name": model_name,
  366. "inputs": 1,
  367. "threads": num_threads,
  368. "batch_size": batch_size,
  369. "sequence_length": sequence_length,
  370. "custom_layer_num": config_modifier.get_layer_num(),
  371. "datetime": str(datetime.now()),
  372. }
  373. result.update(get_latency_result(runtimes, batch_size))
  374. logger.info(result)
  375. results.append(result)
  376. except RuntimeError as e:
  377. logger.exception(e)
  378. torch.cuda.empty_cache()
  379. return results
  380. def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool):
  381. from functools import wraps # noqa: PLC0415
  382. import tensorflow as tf # noqa: PLC0415
  383. def run_func(func):
  384. @wraps(func)
  385. def run_in_eager_mode(*args, **kwargs):
  386. return func(*args, **kwargs)
  387. @wraps(func)
  388. @tf.function(experimental_compile=use_xla)
  389. def run_in_graph_mode(*args, **kwargs):
  390. return func(*args, **kwargs)
  391. if do_eager_mode is True:
  392. assert use_xla is False, (
  393. "Cannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`."
  394. )
  395. return run_in_eager_mode
  396. else:
  397. return run_in_graph_mode
  398. return run_func
  399. def run_tensorflow(
  400. use_gpu,
  401. model_names,
  402. model_class,
  403. config_modifier,
  404. precision,
  405. num_threads,
  406. batch_sizes,
  407. sequence_lengths,
  408. repeat_times,
  409. cache_dir,
  410. verbose,
  411. ):
  412. results = []
  413. import tensorflow as tf # noqa: PLC0415
  414. tf.config.threading.set_intra_op_parallelism_threads(num_threads)
  415. if not use_gpu:
  416. tf.config.set_visible_devices([], "GPU")
  417. if use_gpu and not tf.test.is_built_with_cuda():
  418. logger.error("Please install Tensorflow-gpu, and use a machine with GPU for testing gpu performance.")
  419. return results
  420. if use_gpu: # Restrict TensorFlow to only use the first GPU
  421. physical_devices = tf.config.list_physical_devices("GPU")
  422. try:
  423. tf.config.set_visible_devices(physical_devices[0], "GPU")
  424. tf.config.experimental.set_memory_growth(physical_devices[0], True)
  425. tf.distribute.OneDeviceStrategy(device="/gpu:0")
  426. except RuntimeError as e:
  427. logger.exception(e)
  428. if precision == Precision.FLOAT16 or precision == Precision.INT8:
  429. raise NotImplementedError("Mixed precision is currently not supported.")
  430. for model_name in model_names:
  431. config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
  432. config_modifier.modify(config)
  433. model = load_pretrained_model(
  434. model_name,
  435. config=config,
  436. cache_dir=cache_dir,
  437. custom_model_class=model_class,
  438. is_tf_model=True,
  439. )
  440. tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
  441. max_input_size = tokenizer.model_max_length
  442. for batch_size in batch_sizes:
  443. if batch_size <= 0:
  444. continue
  445. for sequence_length in sequence_lengths:
  446. if max_input_size is not None and sequence_length > max_input_size:
  447. continue
  448. logger.info(f"Run Tensorflow on {model_name} with input shape {[batch_size, sequence_length]}")
  449. import random # noqa: PLC0415
  450. rng = random.Random()
  451. values = [rng.randint(0, config.vocab_size - 1) for i in range(batch_size * sequence_length)]
  452. input_ids = tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)
  453. try:
  454. # Disable both for better inference perf
  455. @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
  456. def encoder_forward():
  457. return model(input_ids, training=False) # noqa: B023
  458. @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
  459. def encoder_decoder_forward():
  460. return model(input_ids, decoder_input_ids=input_ids, training=False) # noqa: B023
  461. @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
  462. def lxmert_forward():
  463. feats = tf.random.normal([1, 1, config.visual_feat_dim]) # noqa: B023
  464. pos = tf.random.normal([1, 1, config.visual_pos_dim]) # noqa: B023
  465. return model( # noqa: B023
  466. input_ids, # noqa: B023
  467. visual_feats=feats,
  468. visual_pos=pos,
  469. training=False,
  470. )
  471. inference = encoder_forward
  472. if config.is_encoder_decoder:
  473. inference = encoder_decoder_forward
  474. elif isinstance(config, LxmertConfig):
  475. inference = lxmert_forward
  476. inference()
  477. runtimes = timeit.repeat(lambda: inference(), repeat=repeat_times, number=1) # noqa: B023
  478. result = {
  479. "engine": "tensorflow",
  480. "version": tf.__version__,
  481. "providers": "NA",
  482. "device": "cuda" if use_gpu else "cpu",
  483. "optimizer": "",
  484. "precision": precision,
  485. "io_binding": "",
  486. "model_name": model_name,
  487. "inputs": 1,
  488. "threads": num_threads,
  489. "batch_size": batch_size,
  490. "sequence_length": sequence_length,
  491. "custom_layer_num": config_modifier.get_layer_num(),
  492. "datetime": str(datetime.now()),
  493. }
  494. result.update(get_latency_result(runtimes, batch_size))
  495. logger.info(result)
  496. results.append(result)
  497. except RuntimeError as e:
  498. logger.exception(e)
  499. from numba import cuda # noqa: PLC0415
  500. device = cuda.get_current_device()
  501. device.reset()
  502. return results
  503. def parse_arguments():
  504. parser = argparse.ArgumentParser()
  505. parser.add_argument(
  506. "-m",
  507. "--models",
  508. required=False,
  509. nargs="+",
  510. type=str,
  511. default=["bert-base-cased", "roberta-base", "gpt2"],
  512. choices=list(MODELS.keys()),
  513. help="Pre-trained models in the list: " + ", ".join(MODELS.keys()),
  514. )
  515. parser.add_argument(
  516. "--model_source",
  517. required=False,
  518. nargs=1,
  519. type=str,
  520. default="pt",
  521. choices=["pt", "tf"],
  522. help="Export onnx from pt or tf",
  523. )
  524. parser.add_argument(
  525. "--model_class",
  526. required=False,
  527. type=str,
  528. default=None,
  529. choices=list(MODEL_CLASSES),
  530. help="Model type selected in the list: " + ", ".join(MODEL_CLASSES),
  531. )
  532. parser.add_argument(
  533. "-e",
  534. "--engines",
  535. required=False,
  536. nargs="+",
  537. type=str,
  538. default=["onnxruntime"],
  539. choices=["onnxruntime", "torch", "torch2", "torchscript", "tensorflow"],
  540. help="Engines to benchmark",
  541. )
  542. parser.add_argument(
  543. "-c",
  544. "--cache_dir",
  545. required=False,
  546. type=str,
  547. default=os.path.join(".", "cache_models"),
  548. help="Directory to cache pre-trained models",
  549. )
  550. parser.add_argument(
  551. "--onnx_dir",
  552. required=False,
  553. type=str,
  554. default=os.path.join(".", "onnx_models"),
  555. help="Directory to store onnx models",
  556. )
  557. parser.add_argument("-g", "--use_gpu", required=False, action="store_true", help="Run on gpu device")
  558. parser.add_argument(
  559. "--provider",
  560. required=False,
  561. type=str,
  562. default=None,
  563. help="Execution provider to use",
  564. )
  565. parser.add_argument(
  566. "-p",
  567. "--precision",
  568. type=Precision,
  569. default=Precision.FLOAT32,
  570. choices=list(Precision),
  571. help="Precision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization",
  572. )
  573. parser.add_argument("--verbose", required=False, action="store_true", help="Print more information")
  574. parser.add_argument(
  575. "--overwrite",
  576. required=False,
  577. action="store_true",
  578. help="Overwrite existing models",
  579. )
  580. parser.add_argument(
  581. "-o",
  582. "--optimizer_info",
  583. type=OptimizerInfo,
  584. default=OptimizerInfo.BYSCRIPT,
  585. choices=list(OptimizerInfo),
  586. help="Optimizer info: Use optimizer.py to optimize onnx model as default. Can also choose from by_ort and no_opt",
  587. )
  588. parser.add_argument(
  589. "-v",
  590. "--validate_onnx",
  591. required=False,
  592. action="store_true",
  593. help="Validate ONNX model",
  594. )
  595. parser.add_argument(
  596. "-f",
  597. "--fusion_csv",
  598. required=False,
  599. default=None,
  600. help="CSV file for saving summary results of graph optimization.",
  601. )
  602. parser.add_argument(
  603. "-d",
  604. "--detail_csv",
  605. required=False,
  606. default=None,
  607. help="CSV file for saving detail results.",
  608. )
  609. parser.add_argument(
  610. "-r",
  611. "--result_csv",
  612. required=False,
  613. default=None,
  614. help="CSV file for saving summary results.",
  615. )
  616. parser.add_argument(
  617. "-i",
  618. "--input_counts",
  619. required=False,
  620. nargs="+",
  621. default=[1],
  622. type=int,
  623. choices=[1, 2, 3],
  624. help="Number of ONNX model inputs. Please use 1 for fair comparison with Torch or TorchScript.",
  625. )
  626. parser.add_argument(
  627. "-t",
  628. "--test_times",
  629. required=False,
  630. default=100,
  631. type=int,
  632. help="Number of repeat times to get average inference latency.",
  633. )
  634. parser.add_argument("-b", "--batch_sizes", nargs="+", type=int, default=[1])
  635. parser.add_argument(
  636. "-s",
  637. "--sequence_lengths",
  638. nargs="+",
  639. type=int,
  640. default=[4, 8, 16, 32, 64, 128, 256],
  641. )
  642. parser.add_argument(
  643. "--disable_ort_io_binding",
  644. required=False,
  645. action="store_true",
  646. help="Disable running ONNX Runtime with binded inputs and outputs. ",
  647. )
  648. parser.set_defaults(disable_ort_io_binding=False)
  649. parser.add_argument(
  650. "-n",
  651. "--num_threads",
  652. required=False,
  653. nargs="+",
  654. type=int,
  655. default=[0],
  656. help="Threads to use",
  657. )
  658. parser.add_argument(
  659. "--force_num_layers",
  660. required=False,
  661. type=int,
  662. default=None,
  663. help="Manually set the model's layer number",
  664. )
  665. parser.add_argument(
  666. "--enable_arm64_bfloat16_fastmath_mlas_gemm",
  667. required=False,
  668. action="store_true",
  669. help="Enable bfloat16 mlas gemm kernels on aarch64. Supported only for CPU EP ",
  670. )
  671. parser.set_defaults(enable_arm64_bfloat16_fastmath_mlas_gemm=False)
  672. FusionOptions.add_arguments(parser)
  673. args = parser.parse_args()
  674. return args
  675. def main():
  676. args = parse_arguments()
  677. setup_logger(args.verbose)
  678. if args.precision == Precision.FLOAT16 and not args.use_gpu:
  679. logger.error("fp16 is for GPU only")
  680. return
  681. if args.precision == Precision.INT8 and args.use_gpu and args.provider not in ["migraphx"]:
  682. logger.error("int8 is for CPU only")
  683. return
  684. if len(args.models) == 1 and MODELS[args.models[0]][3] in ["vit", "swim"]:
  685. args.sequence_lengths = [""]
  686. args.num_threads = sorted({cpu_count if x <= 0 else x for x in args.num_threads})
  687. logger.info(f"Arguments: {args}")
  688. if not os.path.exists(args.cache_dir):
  689. try:
  690. os.mkdir(args.cache_dir)
  691. except OSError:
  692. logger.error("Creation of the directory %s failed", args.cache_dir)
  693. enable_torch = "torch" in args.engines
  694. enable_torch2 = "torch2" in args.engines
  695. enable_torchscript = "torchscript" in args.engines
  696. enable_onnxruntime = "onnxruntime" in args.engines
  697. enable_tensorflow = "tensorflow" in args.engines
  698. if enable_torch2 and version.parse(torch.__version__) < version.parse("2.0.0"):
  699. logger.error(f"PyTorch version must be >=2.0.0 and you are using {torch.__version__}")
  700. return
  701. config_modifier = ConfigModifier(args.force_num_layers)
  702. results = []
  703. for num_threads in args.num_threads:
  704. torch.set_num_threads(num_threads)
  705. logger.debug(torch.__config__.parallel_info())
  706. if enable_torch or enable_torch2 or enable_torchscript:
  707. if args.input_counts != [1]:
  708. logger.warning("--input_counts is not implemented for torch or torchscript engine.")
  709. if enable_torchscript:
  710. results += run_pytorch(
  711. args.use_gpu,
  712. args.models,
  713. args.model_class,
  714. config_modifier,
  715. args.precision,
  716. num_threads,
  717. args.batch_sizes,
  718. args.sequence_lengths,
  719. args.test_times,
  720. True,
  721. False,
  722. args.cache_dir,
  723. args.verbose,
  724. )
  725. if enable_torch:
  726. results += run_pytorch(
  727. args.use_gpu,
  728. args.models,
  729. args.model_class,
  730. config_modifier,
  731. args.precision,
  732. num_threads,
  733. args.batch_sizes,
  734. args.sequence_lengths,
  735. args.test_times,
  736. False,
  737. False,
  738. args.cache_dir,
  739. args.verbose,
  740. )
  741. if enable_torch2:
  742. results += run_pytorch(
  743. args.use_gpu,
  744. args.models,
  745. args.model_class,
  746. config_modifier,
  747. args.precision,
  748. num_threads,
  749. args.batch_sizes,
  750. args.sequence_lengths,
  751. args.test_times,
  752. False,
  753. True,
  754. args.cache_dir,
  755. args.verbose,
  756. )
  757. if enable_tensorflow:
  758. results += run_tensorflow(
  759. args.use_gpu,
  760. args.models,
  761. args.model_class,
  762. config_modifier,
  763. args.precision,
  764. num_threads,
  765. args.batch_sizes,
  766. args.sequence_lengths,
  767. args.test_times,
  768. args.cache_dir,
  769. args.verbose,
  770. )
  771. model_fusion_statistics = {}
  772. if enable_onnxruntime:
  773. try:
  774. use_raw_attention_mask = not args.use_mask_index
  775. results += run_onnxruntime(
  776. args.use_gpu,
  777. args.provider,
  778. args.models,
  779. args.model_class,
  780. config_modifier,
  781. args.precision,
  782. num_threads,
  783. args.batch_sizes,
  784. args.sequence_lengths,
  785. args.test_times,
  786. args.input_counts,
  787. args.optimizer_info,
  788. args.validate_onnx,
  789. args.cache_dir,
  790. args.onnx_dir,
  791. args.verbose,
  792. args.overwrite,
  793. args.disable_ort_io_binding,
  794. use_raw_attention_mask,
  795. model_fusion_statistics,
  796. args.model_source,
  797. args.enable_arm64_bfloat16_fastmath_mlas_gemm,
  798. args,
  799. )
  800. except Exception:
  801. logger.exception("Exception")
  802. time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
  803. if model_fusion_statistics:
  804. csv_filename = args.fusion_csv or f"benchmark_fusion_{time_stamp}.csv"
  805. output_fusion_statistics(model_fusion_statistics, csv_filename)
  806. if len(results) == 0:
  807. if args.batch_sizes != [0]:
  808. logger.warning("No any result available.")
  809. return
  810. csv_filename = args.detail_csv or f"benchmark_detail_{time_stamp}.csv"
  811. output_details(results, csv_filename)
  812. csv_filename = args.result_csv or f"benchmark_summary_{time_stamp}.csv"
  813. output_summary(results, csv_filename, args)
# Run the benchmark only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()