notebook.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397
  1. # Copyright 2020 Hugging Face
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. import re
  16. import time
  17. from typing import Optional, TypeVar
  18. import IPython.display as disp
  19. from ..trainer_callback import TrainerCallback
  20. from ..trainer_utils import IntervalStrategy, has_length
  21. _T = TypeVar("_T")
  22. def _require(x: _T | None, msg: str) -> _T:
  23. if x is None:
  24. raise RuntimeError(msg)
  25. return x
  26. def format_time(t):
  27. "Format `t` (in seconds) to (h):mm:ss"
  28. t = int(t)
  29. h, m, s = t // 3600, (t // 60) % 60, t % 60
  30. return f"{h}:{m:02d}:{s:02d}" if h != 0 else f"{m:02d}:{s:02d}"
  31. def html_progress_bar(value, total, prefix, label, width=300):
  32. # docstyle-ignore
  33. return f"""
  34. <div>
  35. {prefix}
  36. <progress value='{value}' max='{total}' style='width:{width}px; height:20px; vertical-align: middle;'></progress>
  37. {label}
  38. </div>
  39. """
  40. def text_to_html_table(items):
  41. "Put the texts in `items` in an HTML table."
  42. html_code = """<table border="1" class="dataframe">\n"""
  43. html_code += """ <thead>\n <tr style="text-align: left;">\n"""
  44. for i in items[0]:
  45. html_code += f" <th>{i}</th>\n"
  46. html_code += " </tr>\n </thead>\n <tbody>\n"
  47. for line in items[1:]:
  48. html_code += " <tr>\n"
  49. for elt in line:
  50. elt = f"{elt:.6f}" if isinstance(elt, float) else str(elt)
  51. html_code += f" <td>{elt}</td>\n"
  52. html_code += " </tr>\n"
  53. html_code += " </tbody>\n</table><p>"
  54. return html_code
  55. class NotebookProgressBar:
  56. """
  57. A progress par for display in a notebook.
  58. Class attributes (overridden by derived classes)
  59. - **warmup** (`int`) -- The number of iterations to do at the beginning while ignoring `update_every`.
  60. - **update_every** (`float`) -- Since calling the time takes some time, we only do it every presumed
  61. `update_every` seconds. The progress bar uses the average time passed up until now to guess the next value
  62. for which it will call the update.
  63. Args:
  64. total (`int`):
  65. The total number of iterations to reach.
  66. prefix (`str`, *optional*):
  67. A prefix to add before the progress bar.
  68. leave (`bool`, *optional*, defaults to `True`):
  69. Whether or not to leave the progress bar once it's completed. You can always call the
  70. [`~utils.notebook.NotebookProgressBar.close`] method to make the bar disappear.
  71. parent ([`~notebook.NotebookTrainingTracker`], *optional*):
  72. A parent object (like [`~utils.notebook.NotebookTrainingTracker`]) that spawns progress bars and handle
  73. their display. If set, the object passed must have a `display()` method.
  74. width (`int`, *optional*, defaults to 300):
  75. The width (in pixels) that the bar will take.
  76. Example:
  77. ```python
  78. import time
  79. pbar = NotebookProgressBar(100)
  80. for val in range(100):
  81. pbar.update(val)
  82. time.sleep(0.07)
  83. pbar.update(100)
  84. ```"""
  85. warmup = 5
  86. update_every = 0.2
  87. def __init__(
  88. self,
  89. total: int,
  90. prefix: str | None = None,
  91. leave: bool = True,
  92. parent: Optional["NotebookTrainingTracker"] = None,
  93. width: int = 300,
  94. ):
  95. self.total = total
  96. self.prefix = "" if prefix is None else prefix
  97. self.leave = leave
  98. self.parent = parent
  99. self.width = width
  100. self.last_value = None
  101. self.comment = None
  102. self.output = None
  103. self.value = None
  104. self.label = None
  105. if "VSCODE_PID" in os.environ:
  106. self.update_every = 0.5 # Adjusted for smooth updated as html rending is slow on VS Code
  107. # This is the only adjustment required to optimize training html rending
  108. def update(self, value: int, force_update: bool = False, comment: str | None = None):
  109. """
  110. The main method to update the progress bar to `value`.
  111. Args:
  112. value (`int`):
  113. The value to use. Must be between 0 and `total`.
  114. force_update (`bool`, *optional*, defaults to `False`):
  115. Whether or not to force and update of the internal state and display (by default, the bar will wait for
  116. `value` to reach the value it predicted corresponds to a time of more than the `update_every` attribute
  117. since the last update to avoid adding boilerplate).
  118. comment (`str`, *optional*):
  119. A comment to add on the left of the progress bar.
  120. """
  121. self.value = value
  122. if comment is not None:
  123. self.comment = comment
  124. if self.last_value is None:
  125. self.start_time = self.last_time = time.time()
  126. self.start_value = self.last_value = value
  127. self.elapsed_time = self.predicted_remaining = None
  128. self.first_calls = self.warmup
  129. self.wait_for = 1
  130. self.update_bar(value)
  131. elif value <= self.last_value and not force_update:
  132. return
  133. elif force_update or self.first_calls > 0 or value >= min(self.last_value + self.wait_for, self.total):
  134. if self.first_calls > 0:
  135. self.first_calls -= 1
  136. current_time = time.time()
  137. self.elapsed_time = current_time - self.start_time
  138. # We could have value = self.start_value if the update is called twixe with the same start value.
  139. if value > self.start_value:
  140. self.average_time_per_item = self.elapsed_time / (value - self.start_value)
  141. else:
  142. self.average_time_per_item = None
  143. if value >= self.total:
  144. value = self.total
  145. self.predicted_remaining = None
  146. if not self.leave:
  147. self.close()
  148. elif self.average_time_per_item is not None:
  149. self.predicted_remaining = self.average_time_per_item * (self.total - value)
  150. self.update_bar(value)
  151. self.last_value = value
  152. self.last_time = current_time
  153. if (self.average_time_per_item is None) or (self.average_time_per_item == 0):
  154. self.wait_for = 1
  155. else:
  156. self.wait_for = max(int(self.update_every / self.average_time_per_item), 1)
  157. def update_bar(self, value, comment=None):
  158. spaced_value = " " * (len(str(self.total)) - len(str(value))) + str(value)
  159. if self.elapsed_time is None:
  160. self.label = f"[{spaced_value}/{self.total} : < :"
  161. elif self.predicted_remaining is None:
  162. self.label = f"[{spaced_value}/{self.total} {format_time(self.elapsed_time)}"
  163. else:
  164. self.label = (
  165. f"[{spaced_value}/{self.total} {format_time(self.elapsed_time)} <"
  166. f" {format_time(self.predicted_remaining)}"
  167. )
  168. if self.average_time_per_item == 0:
  169. self.label += ", +inf it/s"
  170. else:
  171. self.label += f", {1 / self.average_time_per_item:.2f} it/s"
  172. self.label += "]" if self.comment is None or len(self.comment) == 0 else f", {self.comment}]"
  173. self.display()
  174. def display(self):
  175. self.html_code = html_progress_bar(self.value, self.total, self.prefix, self.label, self.width)
  176. if self.parent is not None:
  177. # If this is a child bar, the parent will take care of the display.
  178. self.parent.display()
  179. return
  180. if self.output is None:
  181. self.output = disp.display(disp.HTML(self.html_code), display_id=True)
  182. else:
  183. self.output.update(disp.HTML(self.html_code))
  184. def close(self):
  185. "Closes the progress bar."
  186. if self.parent is None and self.output is not None:
  187. self.output.update(disp.HTML(""))
  188. class NotebookTrainingTracker(NotebookProgressBar):
  189. """
  190. An object tracking the updates of an ongoing training with progress bars and a nice table reporting metrics.
  191. Args:
  192. num_steps (`int`): The number of steps during training. column_names (`list[str]`, *optional*):
  193. The list of column names for the metrics table (will be inferred from the first call to
  194. [`~utils.notebook.NotebookTrainingTracker.write_line`] if not set).
  195. """
  196. def __init__(self, num_steps, column_names=None):
  197. super().__init__(num_steps)
  198. self.inner_table = None if column_names is None else [column_names]
  199. self.child_bar = None
  200. def display(self):
  201. self.html_code = html_progress_bar(self.value, self.total, self.prefix, self.label, self.width)
  202. if self.inner_table is not None:
  203. self.html_code += text_to_html_table(self.inner_table)
  204. if self.child_bar is not None:
  205. self.html_code += self.child_bar.html_code
  206. if self.output is None:
  207. self.output = disp.display(disp.HTML(self.html_code), display_id=True)
  208. else:
  209. self.output.update(disp.HTML(self.html_code))
  210. def write_line(self, values):
  211. """
  212. Write the values in the inner table.
  213. Args:
  214. values (`dict[str, float]`): The values to display.
  215. """
  216. if self.inner_table is None:
  217. self.inner_table = [list(values.keys()), list(values.values())]
  218. else:
  219. columns = self.inner_table[0]
  220. for key in values:
  221. if key not in columns:
  222. columns.append(key)
  223. self.inner_table[0] = columns
  224. if len(self.inner_table) > 1:
  225. last_values = self.inner_table[-1]
  226. first_column = self.inner_table[0][0]
  227. if last_values[0] != values[first_column]:
  228. # write new line
  229. self.inner_table.append([values.get(c, "No Log") for c in columns])
  230. else:
  231. # update last line
  232. new_values = values
  233. for c in columns:
  234. if c not in new_values:
  235. new_values[c] = last_values[columns.index(c)]
  236. self.inner_table[-1] = [new_values[c] for c in columns]
  237. else:
  238. self.inner_table.append([values[c] for c in columns])
  239. def add_child(self, total, prefix=None, width=300):
  240. """
  241. Add a child progress bar displayed under the table of metrics. The child progress bar is returned (so it can be
  242. easily updated).
  243. Args:
  244. total (`int`): The number of iterations for the child progress bar.
  245. prefix (`str`, *optional*): A prefix to write on the left of the progress bar.
  246. width (`int`, *optional*, defaults to 300): The width (in pixels) of the progress bar.
  247. """
  248. self.child_bar = NotebookProgressBar(total, prefix=prefix, parent=self, width=width)
  249. return self.child_bar
  250. def remove_child(self):
  251. """
  252. Closes the child progress bar.
  253. """
  254. self.child_bar = None
  255. self.display()
  256. class NotebookProgressCallback(TrainerCallback):
  257. """
  258. A [`TrainerCallback`] that displays the progress of training or evaluation, optimized for Jupyter Notebooks or
  259. Google colab.
  260. """
  261. def __init__(self):
  262. self.training_tracker = None
  263. self.prediction_bar = None
  264. self._force_next_update = False
  265. def on_train_begin(self, args, state, control, **kwargs):
  266. self.first_column = "Epoch" if args.eval_strategy == IntervalStrategy.EPOCH else "Step"
  267. self.training_loss = 0
  268. self.last_log = 0
  269. column_names = [self.first_column] + ["Training Loss"]
  270. if args.eval_strategy != IntervalStrategy.NO:
  271. column_names.append("Validation Loss")
  272. self.training_tracker = NotebookTrainingTracker(state.max_steps, column_names)
  273. def on_step_end(self, args, state, control, **kwargs):
  274. epoch = int(state.epoch) if int(state.epoch) == state.epoch else f"{state.epoch:.2f}"
  275. tt = _require(self.training_tracker, "on_train_begin must be called before on_step_end")
  276. tt.update(
  277. state.global_step + 1,
  278. comment=f"Epoch {epoch}/{state.num_train_epochs}",
  279. force_update=self._force_next_update,
  280. )
  281. self._force_next_update = False
  282. def on_prediction_step(self, args, state, control, eval_dataloader=None, **kwargs):
  283. if not has_length(eval_dataloader):
  284. return
  285. if self.prediction_bar is None:
  286. if self.training_tracker is not None:
  287. self.prediction_bar = self.training_tracker.add_child(len(eval_dataloader))
  288. else:
  289. self.prediction_bar = NotebookProgressBar(len(eval_dataloader))
  290. self.prediction_bar.update(1)
  291. else:
  292. self.prediction_bar.update(self.prediction_bar.value + 1)
  293. def on_predict(self, args, state, control, **kwargs):
  294. if self.prediction_bar is not None:
  295. self.prediction_bar.close()
  296. self.prediction_bar = None
  297. def on_log(self, args, state, control, logs=None, **kwargs):
  298. # Only for when there is no evaluation
  299. if args.eval_strategy == IntervalStrategy.NO and "loss" in logs:
  300. tt = _require(self.training_tracker, "on_train_begin must be called before on_log")
  301. values = {"Training Loss": logs["loss"]}
  302. # First column is necessarily Step sine we're not in epoch eval strategy
  303. values["Step"] = state.global_step
  304. tt.write_line(values)
  305. def on_evaluate(self, args, state, control, metrics=None, **kwargs):
  306. tt = _require(self.training_tracker, "on_train_begin must be called before on_evaluate")
  307. values = {"Training Loss": "No log", "Validation Loss": "No log"}
  308. for log in reversed(state.log_history):
  309. if "loss" in log:
  310. values["Training Loss"] = log["loss"]
  311. break
  312. if self.first_column == "Epoch":
  313. values["Epoch"] = int(state.epoch)
  314. else:
  315. values["Step"] = state.global_step
  316. if metrics is None:
  317. metrics = {}
  318. metric_key_prefix = "eval"
  319. for k in metrics:
  320. if k.endswith("_loss"):
  321. metric_key_prefix = re.sub(r"\_loss$", "", k)
  322. _ = metrics.pop("total_flos", None)
  323. _ = metrics.pop("epoch", None)
  324. _ = metrics.pop(f"{metric_key_prefix}_runtime", None)
  325. _ = metrics.pop(f"{metric_key_prefix}_samples_per_second", None)
  326. _ = metrics.pop(f"{metric_key_prefix}_steps_per_second", None)
  327. for k, v in metrics.items():
  328. splits = k.split("_")
  329. name = " ".join([part.capitalize() for part in splits[1:]])
  330. if name == "Loss":
  331. # Single dataset
  332. name = "Validation Loss"
  333. values[name] = v
  334. tt.write_line(values)
  335. tt.remove_child()
  336. self.prediction_bar = None
  337. # Evaluation takes a long time so we should force the next update.
  338. self._force_next_update = True
  339. def on_train_end(self, args, state, control, **kwargs):
  340. tt = _require(self.training_tracker, "on_train_begin must be called before on_train_end")
  341. tt.update(
  342. state.global_step,
  343. comment=f"Epoch {int(state.epoch)}/{state.num_train_epochs}",
  344. force_update=True,
  345. )
  346. self.training_tracker = None