_buckets.py 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193
  1. # Copyright 2026-present, the HuggingFace Inc. team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Shared logic for bucket operations.
  15. This module contains the core buckets logic used by both the CLI and the Python API.
  16. """
  17. import fnmatch
  18. import json
  19. import mimetypes
  20. import os
  21. import sys
  22. import time
  23. from collections.abc import Iterator
  24. from dataclasses import dataclass, field
  25. from datetime import datetime, timezone
  26. from pathlib import Path
  27. from typing import TYPE_CHECKING, Any, Literal
  28. from . import constants, logging
  29. from .errors import BucketNotFoundError
  30. from .utils import XetFileData, disable_progress_bars, enable_progress_bars, parse_datetime
  31. from .utils._terminal import StatusLine
  32. if TYPE_CHECKING:
  33. from .hf_api import HfApi
# Module-level logger, obtained through the package's own `logging` helper.
logger = logging.get_logger(__name__)

# Scheme prefix that distinguishes a bucket location from a local filesystem path.
BUCKET_PREFIX = "hf://buckets/"
_SYNC_TIME_WINDOW_MS = 1000  # 1s safety-window for file modification time comparisons
  37. # =============================================================================
  38. # Bucket data structures
  39. # =============================================================================
  40. def _split_bucket_id_and_prefix(path: str) -> tuple[str, str]:
  41. """Split 'namespace/name(/optional/prefix)' into ('namespace/name', 'prefix').
  42. Returns (bucket_id, prefix) where prefix may be empty string.
  43. Raises ValueError if path doesn't contain at least namespace/name.
  44. """
  45. parts = path.split("/", 2)
  46. if len(parts) < 2 or not parts[0] or not parts[1]:
  47. raise ValueError(f"Invalid bucket path: '{path}'. Expected format: namespace/bucket_name")
  48. bucket_id = f"{parts[0]}/{parts[1]}"
  49. prefix = parts[2] if len(parts) > 2 else ""
  50. return bucket_id, prefix
  51. @dataclass
  52. class BucketInfo:
  53. """
  54. Contains information about a bucket on the Hub. This object is returned by [`bucket_info`] and [`list_buckets`].
  55. Attributes:
  56. id (`str`):
  57. ID of the bucket.
  58. private (`bool`):
  59. Is the bucket private.
  60. created_at (`datetime`):
  61. Date of creation of the bucket on the Hub.
  62. size (`int`):
  63. Size of the bucket in bytes.
  64. total_files (`int`):
  65. Total number of files in the bucket.
  66. """
  67. id: str
  68. private: bool
  69. created_at: datetime
  70. size: int
  71. total_files: int
  72. def __init__(self, **kwargs):
  73. self.id = kwargs.pop("id")
  74. self.private = kwargs.pop("private")
  75. self.created_at = parse_datetime(kwargs.pop("createdAt"))
  76. self.size = kwargs.pop("size")
  77. self.total_files = kwargs.pop("totalFiles")
  78. self.__dict__.update(**kwargs)
  79. @dataclass
  80. class _BucketAddFile:
  81. source: str | Path | bytes
  82. destination: str
  83. xet_hash: str | None = field(default=None)
  84. size: int | None = field(default=None)
  85. mtime: int = field(init=False)
  86. content_type: str | None = field(init=False)
  87. def __post_init__(self) -> None:
  88. self.content_type = None
  89. if isinstance(self.source, (str, Path)): # guess content type from source path
  90. self.content_type = mimetypes.guess_type(self.source)[0]
  91. if self.content_type is None: # or default to destination path content type
  92. self.content_type = mimetypes.guess_type(self.destination)[0]
  93. self.mtime = int(
  94. os.path.getmtime(self.source) * 1000 if not isinstance(self.source, bytes) else time.time() * 1000
  95. )
  96. @dataclass
  97. class _BucketCopyFile:
  98. destination: str
  99. xet_hash: str
  100. source_repo_type: str # "model", "dataset", "space", "bucket"
  101. source_repo_id: str
  102. size: int | None = field(default=None)
  103. mtime: int = field(init=False)
  104. content_type: str | None = field(init=False)
  105. def __post_init__(self) -> None:
  106. self.content_type = mimetypes.guess_type(self.destination)[0]
  107. self.mtime = int(time.time() * 1000)
@dataclass
class _BucketDeleteFile:
    """Internal record holding the path of a single bucket file targeted by a delete operation."""

    # NOTE(review): presumably relative to the bucket root - confirm against callers.
    path: str
@dataclass(frozen=True)
class BucketFileMetadata:
    """Data structure containing information about a file in a bucket.

    Returned by [`get_bucket_file_metadata`]. The dataclass is frozen, so instances cannot be
    modified after creation.

    Args:
        size (`int`):
            Size of the file in bytes.
        xet_file_data (`XetFileData`):
            Xet information for the file (hash and refresh route).
    """

    size: int
    xet_file_data: XetFileData
  123. @dataclass
  124. class BucketUrl:
  125. """Describes a bucket URL on the Hub.
  126. `BucketUrl` is returned by [`create_bucket`]. At initialization, the URL is parsed to populate properties:
  127. - endpoint (`str`)
  128. - namespace (`str`)
  129. - bucket_id (`str`)
  130. - url (`str`)
  131. - handle (`str`)
  132. Args:
  133. url (`str`):
  134. String value of the bucket url.
  135. endpoint (`str`, *optional*):
  136. Endpoint of the Hub. Defaults to <https://huggingface.co>.
  137. """
  138. url: str
  139. endpoint: str = ""
  140. namespace: str = field(init=False)
  141. bucket_id: str = field(init=False)
  142. handle: str = field(init=False)
  143. def __post_init__(self) -> None:
  144. self.endpoint = self.endpoint or constants.ENDPOINT
  145. # Parse URL: expected format is `{endpoint}/buckets/{namespace}/{bucket_name}`
  146. url_path = self.url.replace(self.endpoint, "").strip("/")
  147. # Remove leading "buckets/" prefix
  148. if url_path.startswith("buckets/"):
  149. url_path = url_path[len("buckets/") :]
  150. bucket_id, prefix = _split_bucket_id_and_prefix(url_path)
  151. if prefix:
  152. raise ValueError(f"Unable to parse bucket URL: {self.url}")
  153. self.namespace = bucket_id.split("/")[0]
  154. self.bucket_id = bucket_id
  155. self.handle = f"hf://buckets/{self.bucket_id}"
  156. @dataclass
  157. class BucketFile:
  158. """
  159. Contains information about a file in a bucket on the Hub. This object is returned by [`list_bucket_tree`].
  160. Similar to [`RepoFile`] but for files in buckets.
  161. """
  162. type: Literal["file"]
  163. path: str
  164. size: int
  165. xet_hash: str
  166. mtime: datetime | None
  167. uploaded_at: datetime | None
  168. def __init__(self, **kwargs):
  169. self.type = kwargs.pop("type")
  170. self.path = kwargs.pop("path")
  171. self.size = kwargs.pop("size")
  172. self.xet_hash = kwargs.pop("xetHash")
  173. mtime = kwargs.pop("mtime", None)
  174. self.mtime = parse_datetime(mtime) if mtime else None
  175. uploaded_at = kwargs.pop("uploadedAt", None)
  176. self.uploaded_at = parse_datetime(uploaded_at) if uploaded_at else None
  177. @dataclass
  178. class BucketFolder:
  179. """
  180. Contains information about a directory in a bucket on the Hub. This object is returned by [`list_bucket_tree`].
  181. Similar to [`RepoFolder`] but for directories in buckets.
  182. """
  183. type: Literal["directory"]
  184. path: str
  185. uploaded_at: datetime | None
  186. def __init__(self, **kwargs):
  187. self.type = kwargs.pop("type")
  188. self.path = kwargs.pop("path")
  189. uploaded_at = kwargs.pop("uploadedAt", None) or kwargs.pop("uploaded_at", None)
  190. self.uploaded_at = (
  191. (uploaded_at if isinstance(uploaded_at, datetime) else parse_datetime(uploaded_at))
  192. if uploaded_at
  193. else None
  194. )
  195. # =============================================================================
  196. # Bucket path parsing
  197. # =============================================================================
  198. def _parse_bucket_path(path: str) -> tuple[str, str]:
  199. """Parse a bucket path like hf://buckets/namespace/bucket_name/prefix into (bucket_id, prefix).
  200. Returns:
  201. tuple: (bucket_id, prefix) where bucket_id is "namespace/bucket_name" and prefix may be empty string.
  202. """
  203. if not path.startswith(BUCKET_PREFIX):
  204. raise ValueError(f"Invalid bucket path: {path}. Must start with {BUCKET_PREFIX}")
  205. return _split_bucket_id_and_prefix(path.removeprefix(BUCKET_PREFIX))
def _is_bucket_path(path: str) -> bool:
    """Check if a path is a bucket path.

    A path counts as a bucket path when it starts with `BUCKET_PREFIX`. The rest of the string
    is not validated here.
    """
    return path.startswith(BUCKET_PREFIX)
  209. # =============================================================================
  210. # Sync data structures
  211. # =============================================================================
@dataclass
class SyncOperation:
    """Represents a sync operation to be performed.

    One entry of a `SyncPlan`: what to do (`action`) with the file at `path`, and why (`reason`).
    """

    # What to do with the file. "skip" records a file that was examined but will not be transferred.
    action: Literal["upload", "download", "delete", "skip"]
    # File path relative to the synced directory / bucket prefix.
    path: str
    # Size in bytes of the file concerned, when known.
    size: int | None = None
    # Short human-readable explanation of the decision (e.g. "new file", "size differs").
    reason: str = ""
    # Modification times as ISO-format strings. Each is set only when that side of the sync has the file.
    local_mtime: str | None = None
    remote_mtime: str | None = None
    bucket_file: BucketFile | None = None  # BucketFile when available (not serialized to plan file)
  222. @dataclass
  223. class SyncPlan:
  224. """Represents a complete sync plan."""
  225. source: str
  226. dest: str
  227. timestamp: str
  228. operations: list[SyncOperation] = field(default_factory=list)
  229. def summary(self) -> dict[str, int | str]:
  230. uploads = sum(1 for op in self.operations if op.action == "upload")
  231. downloads = sum(1 for op in self.operations if op.action == "download")
  232. deletes = sum(1 for op in self.operations if op.action == "delete")
  233. skips = sum(1 for op in self.operations if op.action == "skip")
  234. total_size = sum(op.size or 0 for op in self.operations if op.action in ("upload", "download"))
  235. return {
  236. "uploads": uploads,
  237. "downloads": downloads,
  238. "deletes": deletes,
  239. "skips": skips,
  240. "total_size": total_size,
  241. }
  242. # =============================================================================
  243. # Filter matching
  244. # =============================================================================
  245. class FilterMatcher:
  246. """Matches file paths against include/exclude patterns."""
  247. def __init__(
  248. self,
  249. include_patterns: list[str] | None = None,
  250. exclude_patterns: list[str] | None = None,
  251. filter_rules: list[tuple[str, str]] | None = None,
  252. ):
  253. """Initialize the filter matcher.
  254. Args:
  255. include_patterns: Patterns to include (from --include)
  256. exclude_patterns: Patterns to exclude (from --exclude)
  257. filter_rules: Rules from filter file as list of ("+"/"-", pattern) tuples
  258. """
  259. self.include_patterns = include_patterns or []
  260. self.exclude_patterns = exclude_patterns or []
  261. self.filter_rules = filter_rules or []
  262. def matches(self, path: str) -> bool:
  263. """Check if a path should be included based on the filter rules.
  264. Filtering rules:
  265. - Filters are evaluated in order, first matching rule decides
  266. - If no rules match, include by default (unless include patterns are specified)
  267. """
  268. # First check filter rules from file (in order)
  269. for sign, pattern in self.filter_rules:
  270. if fnmatch.fnmatch(path, pattern):
  271. return sign == "+"
  272. # Then check CLI patterns
  273. for pattern in self.exclude_patterns:
  274. if fnmatch.fnmatch(path, pattern):
  275. return False
  276. for pattern in self.include_patterns:
  277. if fnmatch.fnmatch(path, pattern):
  278. return True
  279. # If include patterns were specified but none matched, exclude
  280. if self.include_patterns:
  281. return False
  282. # Default: include
  283. return True
  284. def _parse_filter_file(filter_file: str) -> list[tuple[str, str]]:
  285. """Parse a filter file and return a list of (sign, pattern) tuples.
  286. Filter file format:
  287. - Lines starting with "+" are include patterns
  288. - Lines starting with "-" are exclude patterns
  289. - Empty lines and lines starting with "#" are ignored
  290. """
  291. rules = []
  292. with open(filter_file) as f:
  293. for line in f:
  294. line = line.strip()
  295. if not line or line.startswith("#"):
  296. continue
  297. if line.startswith("+"):
  298. rules.append(("+", line[1:].strip()))
  299. elif line.startswith("-"):
  300. rules.append(("-", line[1:].strip()))
  301. else:
  302. # Default to include if no prefix
  303. rules.append(("+", line))
  304. return rules
  305. # =============================================================================
  306. # File listing
  307. # =============================================================================
  308. def _list_local_files(local_path: str) -> Iterator[tuple[str, int, float]]:
  309. """List all files in a local directory.
  310. Yields:
  311. tuple: (relative_path, size, mtime_ms) for each file
  312. """
  313. local_path = os.path.abspath(local_path)
  314. if not os.path.isdir(local_path):
  315. raise ValueError(f"Local path must be a directory: {local_path}")
  316. for root, _, files in os.walk(local_path):
  317. for filename in files:
  318. full_path = os.path.join(root, filename)
  319. rel_path = os.path.relpath(full_path, local_path)
  320. # Normalize to forward slashes for consistency
  321. rel_path = rel_path.replace(os.sep, "/")
  322. size = os.path.getsize(full_path)
  323. mtime_ms = os.path.getmtime(full_path) * 1000
  324. yield rel_path, size, mtime_ms
  325. def _list_remote_files(api: "HfApi", bucket_id: str, prefix: str) -> Iterator[tuple[str, int, float, Any]]:
  326. """List all files in a bucket with a given prefix.
  327. Yields:
  328. tuple: (relative_path, size, mtime_ms, bucket_file) for each file.
  329. bucket_file is the BucketFile object from list_bucket_tree.
  330. """
  331. for item in api.list_bucket_tree(bucket_id, prefix=prefix or None, recursive=True):
  332. if isinstance(item, BucketFolder):
  333. continue
  334. path = item.path
  335. # Remove prefix from path to get relative path
  336. # Only strip prefix if it's followed by "/" (directory boundary) or is exact match
  337. if prefix:
  338. if path.startswith(prefix + "/"):
  339. rel_path = path[len(prefix) + 1 :]
  340. elif path == prefix:
  341. # Exact match: the file IS the prefix (e.g., single file download)
  342. rel_path = path.rsplit("/", 1)[-1] if "/" in path else path
  343. else:
  344. # Path doesn't match prefix pattern (e.g., "submarine.txt" for prefix "sub")
  345. # Skip this file - it was returned by the API but doesn't belong to this prefix
  346. continue
  347. else:
  348. rel_path = path
  349. mtime_ms = item.mtime.timestamp() * 1000 if item.mtime else 0
  350. yield rel_path, item.size, mtime_ms, item
  351. # =============================================================================
  352. # Sync plan computation
  353. # =============================================================================
  354. def _mtime_to_iso(mtime_ms: float) -> str:
  355. """Convert mtime in milliseconds to ISO format string."""
  356. return datetime.fromtimestamp(mtime_ms / 1000, tz=timezone.utc).isoformat()
  357. def _compare_files_for_sync(
  358. *,
  359. path: str,
  360. action: Literal["upload", "download"],
  361. source_size: int,
  362. source_mtime: float,
  363. dest_size: int,
  364. dest_mtime: float,
  365. source_newer_label: str,
  366. dest_newer_label: str,
  367. ignore_sizes: bool,
  368. ignore_times: bool,
  369. ignore_existing: bool,
  370. bucket_file: Any | None = None,
  371. ) -> SyncOperation:
  372. """Compare source and dest files and return the appropriate sync operation.
  373. This is a unified helper for both upload and download directions.
  374. Args:
  375. path: Relative file path
  376. action: "upload" or "download"
  377. source_size: Size of the source file (bytes)
  378. source_mtime: Mtime of the source file (milliseconds)
  379. dest_size: Size of the destination file (bytes)
  380. dest_mtime: Mtime of the destination file (milliseconds)
  381. source_newer_label: Label when source is newer (e.g., "local newer" or "remote newer")
  382. dest_newer_label: Label when dest is newer (e.g., "remote newer" or "local newer")
  383. ignore_sizes: Only compare mtime
  384. ignore_times: Only compare size
  385. ignore_existing: Skip files that exist on receiver
  386. bucket_file: BucketFile object (for downloads only)
  387. Returns:
  388. SyncOperation describing the action to take
  389. """
  390. local_mtime_iso = _mtime_to_iso(source_mtime if action == "upload" else dest_mtime)
  391. remote_mtime_iso = _mtime_to_iso(dest_mtime if action == "upload" else source_mtime)
  392. base_kwargs: dict[str, Any] = {
  393. "path": path,
  394. "size": source_size,
  395. "local_mtime": local_mtime_iso,
  396. "remote_mtime": remote_mtime_iso,
  397. }
  398. if ignore_existing:
  399. return SyncOperation(action="skip", reason="exists on receiver (--ignore-existing)", **base_kwargs)
  400. size_differs = source_size != dest_size
  401. source_newer = (source_mtime - dest_mtime) > _SYNC_TIME_WINDOW_MS
  402. if ignore_sizes:
  403. if source_newer:
  404. return SyncOperation(action=action, reason=source_newer_label, bucket_file=bucket_file, **base_kwargs)
  405. else:
  406. dest_newer = (dest_mtime - source_mtime) > _SYNC_TIME_WINDOW_MS
  407. skip_reason = dest_newer_label if dest_newer else "same mtime"
  408. return SyncOperation(action="skip", reason=skip_reason, **base_kwargs)
  409. elif ignore_times:
  410. if size_differs:
  411. return SyncOperation(action=action, reason="size differs", bucket_file=bucket_file, **base_kwargs)
  412. else:
  413. return SyncOperation(action="skip", reason="same size", **base_kwargs)
  414. else:
  415. if size_differs or source_newer:
  416. reason = "size differs" if size_differs else source_newer_label
  417. return SyncOperation(action=action, reason=reason, bucket_file=bucket_file, **base_kwargs)
  418. else:
  419. return SyncOperation(action="skip", reason="identical", **base_kwargs)
  420. def _compute_sync_plan(
  421. source: str,
  422. dest: str,
  423. api: "HfApi",
  424. delete: bool = False,
  425. ignore_times: bool = False,
  426. ignore_sizes: bool = False,
  427. existing: bool = False,
  428. ignore_existing: bool = False,
  429. filter_matcher: FilterMatcher | None = None,
  430. status: Any | None = None,
  431. ) -> SyncPlan:
  432. """Compute the sync plan by comparing source and destination.
  433. Returns:
  434. SyncPlan with all operations to be performed
  435. """
  436. filter_matcher = filter_matcher or FilterMatcher()
  437. is_upload = not _is_bucket_path(source) and _is_bucket_path(dest)
  438. is_download = _is_bucket_path(source) and not _is_bucket_path(dest)
  439. if not is_upload and not is_download:
  440. raise ValueError("One of source or dest must be a bucket path (hf://buckets/...) and the other must be local.")
  441. plan = SyncPlan(
  442. source=source,
  443. dest=dest,
  444. timestamp=datetime.now(timezone.utc).isoformat(),
  445. )
  446. remote_total: int | None = None
  447. if is_upload:
  448. # Local -> Remote
  449. local_path = os.path.abspath(source)
  450. bucket_id, prefix = _parse_bucket_path(dest)
  451. if not os.path.isdir(local_path):
  452. raise ValueError(f"Source must be a directory: {local_path}")
  453. # Get local and remote file lists
  454. local_files = {}
  455. for rel_path, size, mtime_ms in _list_local_files(local_path):
  456. if filter_matcher.matches(rel_path):
  457. local_files[rel_path] = (size, mtime_ms)
  458. if status:
  459. status.update(f"Scanning local directory ({len(local_files)} files)")
  460. if status:
  461. status.done(f"Scanning local directory ({len(local_files)} files)")
  462. remote_files = {}
  463. if status:
  464. try:
  465. remote_total = api.bucket_info(bucket_id).total_files
  466. except Exception:
  467. pass
  468. try:
  469. for rel_path, size, mtime_ms, _ in _list_remote_files(api, bucket_id, prefix):
  470. if filter_matcher.matches(rel_path):
  471. remote_files[rel_path] = (size, mtime_ms)
  472. if status:
  473. total_str = f"/{remote_total}" if remote_total is not None else ""
  474. status.update(f"Scanning remote bucket ({len(remote_files)}{total_str} files)")
  475. except BucketNotFoundError:
  476. # Bucket doesn't exist yet - this is expected for new uploads
  477. logger.debug(f"Bucket '{bucket_id}' not found, treating as empty.")
  478. if status:
  479. status.done(f"Scanning remote bucket ({len(remote_files)} files)")
  480. # Compare files
  481. all_paths = set(local_files.keys()) | set(remote_files.keys())
  482. if status:
  483. status.done(f"Comparing files ({len(all_paths)} paths)")
  484. for path in sorted(all_paths):
  485. local_info = local_files.get(path)
  486. remote_info = remote_files.get(path)
  487. if local_info and not remote_info:
  488. # New file
  489. if existing:
  490. # --existing: skip new files
  491. plan.operations.append(
  492. SyncOperation(
  493. action="skip",
  494. path=path,
  495. size=local_info[0],
  496. reason="new file (--existing)",
  497. local_mtime=_mtime_to_iso(local_info[1]),
  498. )
  499. )
  500. else:
  501. plan.operations.append(
  502. SyncOperation(
  503. action="upload",
  504. path=path,
  505. size=local_info[0],
  506. reason="new file",
  507. local_mtime=_mtime_to_iso(local_info[1]),
  508. )
  509. )
  510. elif local_info and remote_info:
  511. # File exists in both - use helper to determine action
  512. local_size, local_mtime = local_info
  513. remote_size, remote_mtime = remote_info
  514. plan.operations.append(
  515. _compare_files_for_sync(
  516. path=path,
  517. action="upload",
  518. source_size=local_size,
  519. source_mtime=local_mtime,
  520. dest_size=remote_size,
  521. dest_mtime=remote_mtime,
  522. source_newer_label="local newer",
  523. dest_newer_label="remote newer",
  524. ignore_sizes=ignore_sizes,
  525. ignore_times=ignore_times,
  526. ignore_existing=ignore_existing,
  527. )
  528. )
  529. elif not local_info and remote_info and delete:
  530. # File only in remote and --delete mode
  531. plan.operations.append(
  532. SyncOperation(
  533. action="delete",
  534. path=path,
  535. size=remote_info[0],
  536. reason="not in source (--delete)",
  537. remote_mtime=_mtime_to_iso(remote_info[1]),
  538. )
  539. )
  540. else:
  541. # Remote -> Local (download)
  542. bucket_id, prefix = _parse_bucket_path(source)
  543. local_path = os.path.abspath(dest)
  544. # Get remote and local file lists
  545. remote_files = {}
  546. bucket_file_map: dict[str, Any] = {}
  547. if status:
  548. try:
  549. remote_total = api.bucket_info(bucket_id).total_files
  550. except Exception:
  551. pass
  552. for rel_path, size, mtime_ms, bucket_file in _list_remote_files(api, bucket_id, prefix):
  553. if filter_matcher.matches(rel_path):
  554. remote_files[rel_path] = (size, mtime_ms)
  555. bucket_file_map[rel_path] = bucket_file
  556. if status:
  557. total_str = f"/{remote_total}" if remote_total is not None else ""
  558. status.update(f"Scanning remote bucket ({len(remote_files)}{total_str} files)")
  559. if status:
  560. status.done(f"Scanning remote bucket ({len(remote_files)} files)")
  561. local_files = {}
  562. if os.path.isdir(local_path):
  563. for rel_path, size, mtime_ms in _list_local_files(local_path):
  564. if filter_matcher.matches(rel_path):
  565. local_files[rel_path] = (size, mtime_ms)
  566. if status:
  567. status.update(f"Scanning local directory ({len(local_files)} files)")
  568. if status:
  569. status.done(f"Scanning local directory ({len(local_files)} files)")
  570. # Compare files
  571. all_paths = set(remote_files.keys()) | set(local_files.keys())
  572. if status:
  573. status.done(f"Comparing files ({len(all_paths)} paths)")
  574. for path in sorted(all_paths):
  575. remote_info = remote_files.get(path)
  576. local_info = local_files.get(path)
  577. if remote_info and not local_info:
  578. # New file
  579. if existing:
  580. # --existing: skip new files
  581. plan.operations.append(
  582. SyncOperation(
  583. action="skip",
  584. path=path,
  585. size=remote_info[0],
  586. reason="new file (--existing)",
  587. remote_mtime=_mtime_to_iso(remote_info[1]),
  588. )
  589. )
  590. else:
  591. plan.operations.append(
  592. SyncOperation(
  593. action="download",
  594. path=path,
  595. size=remote_info[0],
  596. reason="new file",
  597. remote_mtime=_mtime_to_iso(remote_info[1]),
  598. bucket_file=bucket_file_map.get(path),
  599. )
  600. )
  601. elif remote_info and local_info:
  602. # File exists in both - use helper to determine action
  603. remote_size, remote_mtime = remote_info
  604. local_size, local_mtime = local_info
  605. plan.operations.append(
  606. _compare_files_for_sync(
  607. path=path,
  608. action="download",
  609. source_size=remote_size,
  610. source_mtime=remote_mtime,
  611. dest_size=local_size,
  612. dest_mtime=local_mtime,
  613. source_newer_label="remote newer",
  614. dest_newer_label="local newer",
  615. ignore_sizes=ignore_sizes,
  616. ignore_times=ignore_times,
  617. ignore_existing=ignore_existing,
  618. bucket_file=bucket_file_map.get(path),
  619. )
  620. )
  621. elif not remote_info and local_info and delete:
  622. # File only in local and --delete mode
  623. plan.operations.append(
  624. SyncOperation(
  625. action="delete",
  626. path=path,
  627. size=local_info[0],
  628. reason="not in source (--delete)",
  629. local_mtime=_mtime_to_iso(local_info[1]),
  630. )
  631. )
  632. return plan
  633. # =============================================================================
  634. # Plan serialization
  635. # =============================================================================
  636. def _write_plan(plan: SyncPlan, f) -> None:
  637. """Write a sync plan as JSONL to a file-like object."""
  638. # Write header
  639. header = {
  640. "type": "header",
  641. "source": plan.source,
  642. "dest": plan.dest,
  643. "timestamp": plan.timestamp,
  644. "summary": plan.summary(),
  645. }
  646. f.write(json.dumps(header) + "\n")
  647. # Write operations
  648. for op in plan.operations:
  649. op_dict: dict[str, Any] = {
  650. "type": "operation",
  651. "action": op.action,
  652. "path": op.path,
  653. "reason": op.reason,
  654. }
  655. if op.size is not None:
  656. op_dict["size"] = op.size
  657. if op.local_mtime is not None:
  658. op_dict["local_mtime"] = op.local_mtime
  659. if op.remote_mtime is not None:
  660. op_dict["remote_mtime"] = op.remote_mtime
  661. f.write(json.dumps(op_dict) + "\n")
def _save_plan(plan: SyncPlan, plan_file: str) -> None:
    """Save a sync plan to a JSONL file.

    Args:
        plan: The plan to serialize.
        plan_file: Destination path. It is opened in text write mode, so an existing file is overwritten.
    """
    with open(plan_file, "w") as f:
        # The on-disk format (header line + one line per operation) is produced by `_write_plan`.
        _write_plan(plan, f)
  666. def _load_plan(plan_file: str) -> SyncPlan:
  667. """Load a sync plan from a JSONL file."""
  668. with open(plan_file) as f:
  669. lines = f.readlines()
  670. if not lines:
  671. raise ValueError(f"Empty plan file: {plan_file}")
  672. # Parse header
  673. header = json.loads(lines[0])
  674. if header.get("type") != "header":
  675. raise ValueError("Invalid plan file: expected header as first line")
  676. plan = SyncPlan(
  677. source=header["source"],
  678. dest=header["dest"],
  679. timestamp=header["timestamp"],
  680. )
  681. # Parse operations
  682. for line in lines[1:]:
  683. op_dict = json.loads(line)
  684. if op_dict.get("type") != "operation":
  685. continue
  686. plan.operations.append(
  687. SyncOperation(
  688. action=op_dict["action"],
  689. path=op_dict["path"],
  690. size=op_dict.get("size"),
  691. reason=op_dict.get("reason", ""),
  692. local_mtime=op_dict.get("local_mtime"),
  693. remote_mtime=op_dict.get("remote_mtime"),
  694. )
  695. )
  696. return plan
  697. # =============================================================================
  698. # Plan execution
  699. # =============================================================================
  700. def _execute_plan(plan: SyncPlan, api: "HfApi", verbose: bool = False, status: Any | None = None) -> None:
  701. """Execute a sync plan."""
  702. is_upload = not _is_bucket_path(plan.source) and _is_bucket_path(plan.dest)
  703. is_download = _is_bucket_path(plan.source) and not _is_bucket_path(plan.dest)
  704. if is_upload:
  705. local_path = os.path.abspath(plan.source)
  706. bucket_id, prefix = _parse_bucket_path(plan.dest)
  707. prefix = prefix.rstrip("/") # Avoid double slashes in remote paths
  708. # Collect operations
  709. add_files: list[tuple[str | Path | bytes, str]] = []
  710. delete_paths: list[str] = []
  711. for op in plan.operations:
  712. match op.action:
  713. case "upload":
  714. local_file = os.path.join(local_path, op.path)
  715. remote_path = f"{prefix}/{op.path}" if prefix else op.path
  716. if verbose:
  717. print(f" Uploading: {op.path} ({op.reason})")
  718. add_files.append((local_file, remote_path))
  719. case "delete":
  720. remote_path = f"{prefix}/{op.path}" if prefix else op.path
  721. if verbose:
  722. print(f" Deleting: {op.path} ({op.reason})")
  723. delete_paths.append(remote_path)
  724. case "skip" if verbose:
  725. print(f" Skipping: {op.path} ({op.reason})")
  726. # Execute batch operations
  727. if add_files or delete_paths:
  728. if status:
  729. parts = []
  730. if add_files:
  731. parts.append(f"uploading {len(add_files)} files")
  732. if delete_paths:
  733. parts.append(f"deleting {len(delete_paths)} files")
  734. status.done(", ".join(parts).capitalize())
  735. api.batch_bucket_files(
  736. bucket_id,
  737. add=add_files or None,
  738. delete=delete_paths or None,
  739. )
  740. elif is_download:
  741. bucket_id, prefix = _parse_bucket_path(plan.source)
  742. prefix = prefix.rstrip("/") # Avoid double slashes in remote paths
  743. local_path = os.path.abspath(plan.dest)
  744. # Ensure local directory exists
  745. os.makedirs(local_path, exist_ok=True)
  746. # Collect download operations
  747. download_files: list[tuple[str | BucketFile, str | Path]] = []
  748. delete_files: list[str] = []
  749. for op in plan.operations:
  750. if op.action == "download":
  751. local_file = os.path.join(local_path, op.path)
  752. # Ensure parent directory exists
  753. os.makedirs(os.path.dirname(local_file), exist_ok=True)
  754. if verbose:
  755. print(f" Downloading: {op.path} ({op.reason})")
  756. # Use BucketFile when available (avoids extra metadata fetch per file)
  757. if op.bucket_file is not None:
  758. download_files.append((op.bucket_file, local_file))
  759. else:
  760. remote_path = f"{prefix}/{op.path}" if prefix else op.path
  761. download_files.append((remote_path, local_file))
  762. elif op.action == "delete":
  763. local_file = os.path.join(local_path, op.path)
  764. if verbose:
  765. print(f" Deleting: {op.path} ({op.reason})")
  766. delete_files.append(local_file)
  767. elif op.action == "skip" and verbose:
  768. print(f" Skipping: {op.path} ({op.reason})")
  769. # Execute downloads
  770. if len(download_files) > 0:
  771. if status:
  772. status.done(f"Downloading {len(download_files)} files")
  773. api.download_bucket_files(bucket_id, download_files)
  774. # Execute deletes
  775. if status and delete_files:
  776. status.done(f"Deleting {len(delete_files)} local files")
  777. for file_path in delete_files:
  778. if os.path.exists(file_path):
  779. os.remove(file_path)
  780. # Remove empty parent directories
  781. parent = os.path.dirname(file_path)
  782. while parent != local_path:
  783. try:
  784. os.rmdir(parent)
  785. parent = os.path.dirname(parent)
  786. except OSError:
  787. break
  788. def _print_plan_summary(plan: SyncPlan) -> None:
  789. """Print a summary of the sync plan."""
  790. summary = plan.summary()
  791. print(f"Sync plan: {plan.source} -> {plan.dest}")
  792. print(f" Uploads: {summary['uploads']}")
  793. print(f" Downloads: {summary['downloads']}")
  794. print(f" Deletes: {summary['deletes']}")
  795. print(f" Skips: {summary['skips']}")
  796. # =============================================================================
  797. # Public sync function (Python API)
  798. # =============================================================================
  799. def sync_bucket_internal(
  800. source: str | None = None,
  801. dest: str | None = None,
  802. *,
  803. api: "HfApi",
  804. delete: bool = False,
  805. ignore_times: bool = False,
  806. ignore_sizes: bool = False,
  807. existing: bool = False,
  808. ignore_existing: bool = False,
  809. include: list[str] | None = None,
  810. exclude: list[str] | None = None,
  811. filter_from: str | None = None,
  812. plan: str | None = None,
  813. apply: str | None = None,
  814. dry_run: bool = False,
  815. verbose: bool = False,
  816. quiet: bool = False,
  817. token: bool | str | None = None,
  818. ) -> SyncPlan:
  819. """Sync files between a local directory and a bucket.
  820. This is equivalent to the ``hf buckets sync`` CLI command. One of ``source`` or ``dest`` must be a bucket path
  821. (``hf://buckets/...``) and the other must be a local directory path.
  822. Args:
  823. source (`str`, *optional*):
  824. Source path: local directory or ``hf://buckets/namespace/bucket_name(/prefix)``.
  825. Required unless using ``apply``.
  826. dest (`str`, *optional*):
  827. Destination path: local directory or ``hf://buckets/namespace/bucket_name(/prefix)``.
  828. Required unless using ``apply``.
  829. api ([`HfApi`]):
  830. The HfApi instance to use for API calls.
  831. delete (`bool`, *optional*, defaults to `False`):
  832. Delete destination files not present in source.
  833. ignore_times (`bool`, *optional*, defaults to `False`):
  834. Skip files only based on size, ignoring modification times.
  835. ignore_sizes (`bool`, *optional*, defaults to `False`):
  836. Skip files only based on modification times, ignoring sizes.
  837. existing (`bool`, *optional*, defaults to `False`):
  838. Skip creating new files on receiver (only update existing files).
  839. ignore_existing (`bool`, *optional*, defaults to `False`):
  840. Skip updating files that exist on receiver (only create new files).
  841. include (`list[str]`, *optional*):
  842. Include files matching patterns (fnmatch-style).
  843. exclude (`list[str]`, *optional*):
  844. Exclude files matching patterns (fnmatch-style).
  845. filter_from (`str`, *optional*):
  846. Path to a filter file with include/exclude rules.
  847. plan (`str`, *optional*):
  848. Save sync plan to this JSONL file instead of executing.
  849. apply (`str`, *optional*):
  850. Apply a previously saved plan file. When set, ``source`` and ``dest`` are not needed.
  851. dry_run (`bool`, *optional*, defaults to `False`):
  852. Print sync plan to stdout as JSONL without executing.
  853. verbose (`bool`, *optional*, defaults to `False`):
  854. Show detailed per-file operations.
  855. quiet (`bool`, *optional*, defaults to `False`):
  856. Suppress all output and progress bars.
  857. token (Union[bool, str, None], optional):
  858. A valid user access token. If not provided, the locally saved token will be used.
  859. Returns:
  860. [`SyncPlan`]: The computed (or loaded) sync plan.
  861. Raises:
  862. `ValueError`: If arguments are invalid (e.g., both paths are remote, conflicting options).
  863. Example:
  864. ```python
  865. >>> from huggingface_hub import HfApi
  866. >>> api = HfApi()
  867. # Upload local directory to bucket
  868. >>> api.sync_bucket("./data", "hf://buckets/username/my-bucket")
  869. # Download bucket to local directory
  870. >>> api.sync_bucket("hf://buckets/username/my-bucket", "./data")
  871. # Sync with delete and filtering
  872. >>> api.sync_bucket(
  873. ... "./data",
  874. ... "hf://buckets/username/my-bucket",
  875. ... delete=True,
  876. ... include=["*.safetensors"],
  877. ... )
  878. # Dry run: preview what would be synced
  879. >>> plan = api.sync_bucket("./data", "hf://buckets/username/my-bucket", dry_run=True)
  880. >>> plan.summary()
  881. {'uploads': 3, 'downloads': 0, 'deletes': 0, 'skips': 1, 'total_size': 4096}
  882. # Save plan for review, then apply
  883. >>> api.sync_bucket("./data", "hf://buckets/username/my-bucket", plan="sync-plan.jsonl")
  884. >>> api.sync_bucket(apply="sync-plan.jsonl")
  885. ```
  886. """
  887. # Build API with token if needed
  888. if token is not None:
  889. from .hf_api import HfApi
  890. api = HfApi(token=token)
  891. # --- Apply mode ---
  892. if apply:
  893. if source or dest:
  894. raise ValueError("Cannot specify source/dest when using apply.")
  895. if plan is not None:
  896. raise ValueError("Cannot specify both plan and apply.")
  897. if delete:
  898. raise ValueError("Cannot specify delete when using apply.")
  899. if ignore_times:
  900. raise ValueError("Cannot specify ignore_times when using apply.")
  901. if ignore_sizes:
  902. raise ValueError("Cannot specify ignore_sizes when using apply.")
  903. if include:
  904. raise ValueError("Cannot specify include when using apply.")
  905. if exclude:
  906. raise ValueError("Cannot specify exclude when using apply.")
  907. if filter_from:
  908. raise ValueError("Cannot specify filter_from when using apply.")
  909. if existing:
  910. raise ValueError("Cannot specify existing when using apply.")
  911. if ignore_existing:
  912. raise ValueError("Cannot specify ignore_existing when using apply.")
  913. if dry_run:
  914. raise ValueError("Cannot specify dry_run when using apply.")
  915. sync_plan = _load_plan(apply)
  916. status = StatusLine(enabled=not quiet)
  917. if not quiet:
  918. _print_plan_summary(sync_plan)
  919. print("Executing plan...")
  920. if quiet:
  921. disable_progress_bars()
  922. try:
  923. _execute_plan(sync_plan, api, verbose=verbose, status=status)
  924. finally:
  925. if quiet:
  926. enable_progress_bars()
  927. if not quiet:
  928. print("Sync completed.")
  929. return sync_plan
  930. # --- Normal mode ---
  931. if not source or not dest:
  932. raise ValueError("Both source and dest are required (unless using apply).")
  933. source_is_bucket = _is_bucket_path(source)
  934. dest_is_bucket = _is_bucket_path(dest)
  935. if source_is_bucket and dest_is_bucket:
  936. raise ValueError("Remote to remote sync is not supported. One path must be local.")
  937. if not source_is_bucket and not dest_is_bucket:
  938. raise ValueError("One of source or dest must be a bucket path (hf://buckets/...).")
  939. if ignore_times and ignore_sizes:
  940. raise ValueError("Cannot specify both ignore_times and ignore_sizes.")
  941. if existing and ignore_existing:
  942. raise ValueError("Cannot specify both existing and ignore_existing.")
  943. if dry_run and plan:
  944. raise ValueError("Cannot specify both dry_run and plan.")
  945. # Validate local path
  946. if source_is_bucket:
  947. if os.path.exists(dest) and not os.path.isdir(dest):
  948. raise ValueError(f"Destination must be a directory: {dest}")
  949. else:
  950. if not os.path.isdir(source):
  951. raise ValueError(f"Source must be an existing directory: {source}")
  952. # Build filter matcher
  953. filter_rules = None
  954. if filter_from:
  955. filter_rules = _parse_filter_file(filter_from)
  956. filter_matcher = FilterMatcher(
  957. include_patterns=include,
  958. exclude_patterns=exclude,
  959. filter_rules=filter_rules,
  960. )
  961. # Compute sync plan
  962. status = StatusLine(enabled=not quiet and not dry_run)
  963. sync_plan = _compute_sync_plan(
  964. source=source,
  965. dest=dest,
  966. api=api,
  967. delete=delete,
  968. ignore_times=ignore_times,
  969. ignore_sizes=ignore_sizes,
  970. existing=existing,
  971. ignore_existing=ignore_existing,
  972. filter_matcher=filter_matcher,
  973. status=status,
  974. )
  975. if dry_run:
  976. _write_plan(sync_plan, sys.stdout)
  977. return sync_plan
  978. if plan:
  979. _save_plan(sync_plan, plan)
  980. if not quiet:
  981. _print_plan_summary(sync_plan)
  982. print(f"Plan saved to: {plan}")
  983. return sync_plan
  984. # Execute plan
  985. if not quiet:
  986. _print_plan_summary(sync_plan)
  987. summary = sync_plan.summary()
  988. if summary["uploads"] == 0 and summary["downloads"] == 0 and summary["deletes"] == 0:
  989. if not quiet:
  990. print("Nothing to sync.")
  991. return sync_plan
  992. if not quiet:
  993. print("Syncing...")
  994. if quiet:
  995. disable_progress_bars()
  996. try:
  997. _execute_plan(sync_plan, api, verbose=verbose, status=status)
  998. finally:
  999. if quiet:
  1000. enable_progress_bars()
  1001. if not quiet:
  1002. print("Sync completed.")
  1003. return sync_plan