From a3e5a67aa432fd140320b0e7588ecbad6c5e8202 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Fri, 24 Oct 2025 17:02:59 +0200 Subject: [PATCH 01/19] checksum verification for cached repos --- docs/source/en/guides/cli.md | 38 ++++ docs/source/en/guides/manage-cache.md | 22 +++ docs/source/en/package_reference/cli.md | 32 ++++ src/huggingface_hub/cli/cache.py | 100 +++++++++- src/huggingface_hub/hf_api.py | 81 ++++++++ src/huggingface_hub/utils/_verification.py | 204 +++++++++++++++++++++ tests/test_cli.py | 120 +++++++++++- tests/test_hf_api.py | 24 +++ tests/test_verification.py | 113 ++++++++++++ 9 files changed, 732 insertions(+), 2 deletions(-) create mode 100644 src/huggingface_hub/utils/_verification.py create mode 100644 tests/test_verification.py diff --git a/docs/source/en/guides/cli.md b/docs/source/en/guides/cli.md index e8625c9d25..5ff4dd85c4 100644 --- a/docs/source/en/guides/cli.md +++ b/docs/source/en/guides/cli.md @@ -673,6 +673,44 @@ Deleted 3 unreferenced revision(s); freed 2.4G. As with the other cache commands, `--dry-run`, `--yes`, and `--cache-dir` are available. Refer to the [Manage your cache](./manage-cache) guide for more examples. +## hf cache verify + +Use `hf cache verify` to validate local files against their checksums on the Hub. Target a single repo per invocation and choose between verifying the cache snapshot or a regular local directory. 
+ +Examples: + +```bash +# Verify main revision of a model in cache +>>> hf cache verify deepseek-ai/DeepSeek-OCR + +# Verify a specific revision +>>> hf cache verify deepseek-ai/DeepSeek-OCR --revision refs/pr/1 +>>> hf cache verify deepseek-ai/DeepSeek-OCR --revision abcdef123 + +# Verify a private repo +>>> hf cache verify me/private-model --token hf_*** + +# Verify a dataset +>>> hf cache verify karpathy/fineweb-edu-100b-shuffle --repo-type dataset + +# Verify files in a local directory +>>> hf cache verify deepseek-ai/DeepSeek-OCR --local-dir /path/to/repo +``` + +By default, the command warns about missing or extra files but does not fail. Use flags to make these conditions fail the command: + +```bash +>>> hf cache verify gpt2 --fail-on-missing-files --fail-on-extra-files +``` + +On success, you will see a summary: + +```text +✅ Verified 60 file(s) at e7da7f221d5bf496a48136c0cd264e630fe9fcc8; no checksum mismatches. +``` + +If mismatches are detected, the command prints a detailed list and exits with a non-zero status. + ## hf repo tag create The `hf repo tag create` command allows you to tag, untag, and list tags for repositories. diff --git a/docs/source/en/guides/manage-cache.md b/docs/source/en/guides/manage-cache.md index 67a58c4427..9222275d38 100644 --- a/docs/source/en/guides/manage-cache.md +++ b/docs/source/en/guides/manage-cache.md @@ -479,6 +479,28 @@ HFCacheInfo( ) ``` +### Verify your cache + +`huggingface_hub` can verify that your cached files match the checksums on the Hub. Use `hf cache verify` from the CLI to validate one or more cached repositories or specific revisions. + +Verify a whole cached repository by repo ID (verifies every cached revision for that repo): + +```bash +>>> hf cache verify model/sentence-transformers/all-MiniLM-L6-v2 +✅ Verified 28 file(s) across 1 revision(s); no checksum mismatches detected. 
+``` + +Verify specific cached revisions by hash (you can pass several targets at once): + +```text +➜ hf cache verify 1c610f6b3f5e7d8a d4ec9b72 +❌ Checksum verification failed for the following file(s): + - dataset/nyu-mll/glue@bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c::cola/test-00000-of-00001.parquet: missing locally. +``` + +> [!TIP] +> Pair `hf cache verify` with `--cache-dir PATH` when working outside the default cache, and `--token` to verify against private or gated repositories. + ### Clean your cache Scanning your cache is interesting but what you really want to do next is usually to diff --git a/docs/source/en/package_reference/cli.md b/docs/source/en/package_reference/cli.md index 83d639ed11..ba22bce8bb 100644 --- a/docs/source/en/package_reference/cli.md +++ b/docs/source/en/package_reference/cli.md @@ -152,6 +152,7 @@ $ hf cache [OPTIONS] COMMAND [ARGS]... * `ls`: List cached repositories or revisions. * `prune`: Remove detached revisions from the cache. * `rm`: Remove cached repositories or revisions. +* `verify`: Verify checksums for a single repo... ### `hf cache ls` @@ -210,6 +211,37 @@ $ hf cache rm [OPTIONS] TARGETS... * `--dry-run / --no-dry-run`: Preview deletions without removing anything. [default: no-dry-run] * `--help`: Show this message and exit. +### `hf cache verify` + +Verify checksums for a single repo revision from cache or a local directory. + +Examples: + - Verify main revision in cache: `hf cache verify gpt2` + - Verify specific revision: `hf cache verify gpt2 --revision refs/pr/1` + - Verify dataset: `hf cache verify karpathy/fineweb-edu-100b-shuffle --repo-type dataset` + - Verify local dir: `hf cache verify deepseek-ai/DeepSeek-OCR --local-dir /path/to/repo` + +**Usage**: + +```console +$ hf cache verify [OPTIONS] REPO_ID +``` + +**Arguments**: + +* `REPO_ID`: The ID of the repo (e.g. `username/repo-name`). [required] + +**Options**: + +* `--repo-type [model|dataset|space]`: The type of repository (model, dataset, or space). 
[default: model] +* `--revision TEXT`: Git revision id which can be a branch name, a tag, or a commit hash. +* `--cache-dir TEXT`: Cache directory to use when verifying files from cache (defaults to Hugging Face cache). +* `--local-dir TEXT`: If set, verify files under this directory instead of the cache. +* `--fail-on-missing-files / --no-fail-on-missing-files`: Fail if some files exist on the remote but are missing locally. [default: no-fail-on-missing-files] +* `--fail-on-extra-files / --no-fail-on-extra-files`: Fail if some files exist locally but are not present on the remote revision. [default: no-fail-on-extra-files] +* `--token TEXT`: A User Access Token generated from https://huggingface.co/settings/tokens. +* `--help`: Show this message and exit. + ## `hf download` Download files from the Hub. diff --git a/src/huggingface_hub/cli/cache.py b/src/huggingface_hub/cli/cache.py index 939c991833..0ed5241dda 100644 --- a/src/huggingface_hub/cli/cache.py +++ b/src/huggingface_hub/cli/cache.py @@ -37,7 +37,7 @@ tabulate, ) from ..utils._parsing import parse_duration, parse_size -from ._cli_utils import typer_factory +from ._cli_utils import RepoIdArg, RepoTypeOpt, RevisionOpt, TokenOpt, get_hf_api, typer_factory cache_cli = typer_factory(help="Manage local cache directory.") @@ -634,3 +634,101 @@ def prune( strategy.execute() print(f"Deleted {counts.total_revision_count} unreferenced revision(s); freed {strategy.expected_freed_size_str}.") + + +@cache_cli.command() +def verify( + repo_id: RepoIdArg, + repo_type: RepoTypeOpt = RepoTypeOpt.model, + revision: RevisionOpt = None, + cache_dir: Annotated[ + Optional[str], + typer.Option( + help="Cache directory to use when verifying files from cache (defaults to Hugging Face cache).", + ), + ] = None, + local_dir: Annotated[ + Optional[str], + typer.Option( + help="If set, verify files under this directory instead of the cache.", + ), + ] = None, + fail_on_missing_files: Annotated[ + bool, + typer.Option( + help="Fail 
if some files exist on the remote but are missing locally.", + ), + ] = False, + fail_on_extra_files: Annotated[ + bool, + typer.Option( + help="Fail if some files exist locally but are not present on the remote revision.", + ), + ] = False, + token: TokenOpt = None, +) -> None: + """Verify checksums for a single repo revision from cache or a local directory. + + Examples: + - Verify main revision in cache: `hf cache verify gpt2` + - Verify specific revision: `hf cache verify gpt2 --revision refs/pr/1` + - Verify dataset: `hf cache verify karpathy/fineweb-edu-100b-shuffle --repo-type dataset` + - Verify local dir: `hf cache verify deepseek-ai/DeepSeek-OCR --local-dir /path/to/repo` + """ + + if local_dir is not None and cache_dir is not None: + print("Cannot pass both --local-dir and --cache-dir. Use one or the other.") + raise typer.Exit(code=2) + + api = get_hf_api(token=token) + + try: + result = api.verify_repo_checksums( + repo_id=repo_id, + repo_type=repo_type.value if hasattr(repo_type, "value") else str(repo_type), + revision=revision, + local_dir=local_dir, + cache_dir=cache_dir, + token=token, + ) + except ValueError as exc: + print(str(exc)) + raise typer.Exit(code=1) + + # Print mismatches first if any + if result.mismatches: + print("❌ Checksum verification failed for the following file(s):") + for m in result.mismatches: + print(f" - {m['path']}: expected {m['expected']} ({m['algorithm']}), got {m['actual']}") + + # Handle missing/extra + exit_code = 0 + if result.missing_paths: + if fail_on_missing_files: + print("Missing files (present remotely, absent locally):") + for p in result.missing_paths: + print(f" - {p}") + exit_code = 1 + else: + print( + f"{len(result.missing_paths)} remote file(s) are missing locally. Use --fail-on-missing-files for details." 
+ ) + + if result.extra_paths: + if fail_on_extra_files: + print("Extra files (present locally, absent remotely):") + for p in result.extra_paths: + print(f" - {p}") + exit_code = 1 + else: + print( + f"{len(result.extra_paths)} local file(s) do not exist on remote repo. Use --fail-on-extra-files for more details." + ) + + if result.mismatches: + exit_code = 1 + + if exit_code != 0: + raise typer.Exit(code=exit_code) + + print(f"✅ Verified {result.checked_count} file(s) at {result.revision}; no checksum mismatches.") diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 17b55ce3a5..d601c720d9 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -105,11 +105,13 @@ from .utils._auth import _get_token_from_environment, _get_token_from_file, _get_token_from_google_colab from .utils._deprecation import _deprecate_arguments from .utils._typing import CallableT +from .utils._verification import collect_local_files, resolve_local_root, verify_maps from .utils.endpoint_helpers import _is_emission_within_threshold if TYPE_CHECKING: from .inference._providers import PROVIDER_T + from .utils._verification import Verification R = TypeVar("R") # Return type CollectionItemType_T = Literal["model", "dataset", "space", "paper", "collection"] @@ -3080,6 +3082,84 @@ def list_repo_tree( for path_info in paginate(path=tree_url, headers=headers, params={"recursive": recursive, "expand": expand}): yield (RepoFile(**path_info) if path_info["type"] == "file" else RepoFolder(**path_info)) + @validate_hf_hub_args + def verify_repo_checksums( + self, + repo_id: str, + *, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + local_dir: Optional[Union[str, Path]] = None, + cache_dir: Optional[Union[str, Path]] = None, + token: Union[str, bool, None] = None, + ) -> "Verification": + """ + Verify local files for a repo against Hub checksums. 
+ + Args: + repo_id (`str`): + A namespace (user or an organization) and a repo name separated by a `/`. + repo_type (`str`, *optional*): + The type of the repository from which to get the tree (`"model"`, `"dataset"` or `"space"`. + Defaults to `"model"`. + revision (`str`, *optional*): + The revision of the repository from which to get the tree. Defaults to `"main"` branch. + local_dir (`str` or `Path`, *optional*): + The local directory to verify. + cache_dir (`str` or `Path`, *optional*): + The cache directory to verify. + token (Union[bool, str, None], optional): + A valid user access token (string). Defaults to the locally saved + token, which is the recommended method for authentication (see + https://huggingface.co/docs/huggingface_hub/quick-start#authentication). + To disable authentication, pass `False`. + + Returns: + [`Verification`]: a structured result containing the verification details. + + Raises: + [`~utils.RepositoryNotFoundError`]: + If repository is not found (error 404): wrong repo_id/repo_type, private but not authenticated or repo + does not exist. + [`~utils.RevisionNotFoundError`]: + If revision is not found (error 404) on the repo. + [`~utils.RemoteEntryNotFoundError`]: + If the tree (folder) does not exist (error 404) on the repo. 
+ + """ + + if repo_type is None: + repo_type = constants.REPO_TYPE_MODEL + + if local_dir is not None and cache_dir is not None: + raise ValueError("Pass either `local_dir` or `cache_dir`, not both.") + + root, remote_revision = resolve_local_root( + repo_id=repo_id, + repo_type=repo_type, + revision=revision, + cache_dir=Path(cache_dir) if cache_dir is not None else None, + local_dir=Path(local_dir) if local_dir is not None else None, + ) + local_by_path = collect_local_files(root) + + # get remote entries + remote_by_path: dict[str, object] = {} + for entry in self.list_repo_tree( + repo_id=repo_id, recursive=True, revision=remote_revision, repo_type=repo_type, token=token + ): + path = getattr(entry, "path", None) + if not path: + continue + lfs = getattr(entry, "lfs", None) + has_lfs_sha = (getattr(lfs, "sha256", None) is not None) or ( + isinstance(lfs, dict) and lfs.get("sha256") is not None + ) + if hasattr(entry, "blob_id") or has_lfs_sha: + remote_by_path[path] = entry + + return verify_maps(remote_by_path=remote_by_path, local_by_path=local_by_path, revision=remote_revision) + @validate_hf_hub_args def list_repo_refs( self, @@ -10733,6 +10813,7 @@ def _parse_revision_from_pr_url(pr_url: str) -> str: list_repo_commits = api.list_repo_commits list_repo_tree = api.list_repo_tree get_paths_info = api.get_paths_info +verify_repo_checksums = api.verify_repo_checksums get_model_tags = api.get_model_tags get_dataset_tags = api.get_dataset_tags diff --git a/src/huggingface_hub/utils/_verification.py b/src/huggingface_hub/utils/_verification.py new file mode 100644 index 0000000000..9745b335eb --- /dev/null +++ b/src/huggingface_hub/utils/_verification.py @@ -0,0 +1,204 @@ +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Callable, Literal, Optional, TypedDict + +from .. 
import constants +from ..file_download import repo_folder_name +from .sha import git_hash, sha_fileobj + + +# using fullmatch for clarity and strictness +_REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$") + + +# Typed structure describing a checksum mismatch +class Mismatch(TypedDict): + path: str + expected: str + actual: str + algorithm: str + + +HashAlgo = Literal["sha256", "git-sha1"] + + +@dataclass(frozen=True) +class Verification: + revision: str + checked_count: int + mismatches: list[Mismatch] + missing_paths: list[str] + extra_paths: list[str] + + @property + def ok(self) -> bool: + return not (self.mismatches or self.missing_paths or self.extra_paths) + + +def _collect_files_from_directory(root: Path) -> dict[str, Path]: + """ + Return a mapping of repo-relative path -> absolute path for all files under `root`. + """ + return {p.relative_to(root).as_posix(): p for p in root.rglob("*") if p.is_file()} + + +def _resolve_commit_hash_from_cache(storage_folder: Path, revision: Optional[str]) -> str: + """ + Resolve a commit hash from a cache repo folder and an optional revision. + """ + if revision and _REGEX_COMMIT_HASH.fullmatch(revision): + return revision + + refs_dir = storage_folder / "refs" + snapshots_dir = storage_folder / "snapshots" + + if revision: + ref_path = refs_dir / revision + if ref_path.is_file(): + return ref_path.read_text(encoding="utf-8").strip() + raise ValueError(f"Revision '{revision}' could not be resolved in cache (expected file '{ref_path}').") + + # No revision provided: try common defaults + main_ref = refs_dir / "main" + if main_ref.is_file(): + return main_ref.read_text(encoding="utf-8").strip() + + if not snapshots_dir.is_dir(): + raise ValueError(f"Cache repo is missing snapshots directory: {snapshots_dir}. 
Provide --revision explicitly.") + + candidates = [p.name for p in snapshots_dir.iterdir() if p.is_dir() and _REGEX_COMMIT_HASH.fullmatch(p.name)] + if len(candidates) == 1: + return candidates[0] + + raise ValueError( + "Ambiguous cached revision: multiple snapshots found and no refs to disambiguate. Please pass --revision." + ) + + +def resolve_expected_hash(entry: object) -> tuple[HashAlgo, str]: + """ + Return the algorithm and expected hash for a remote entry. + Prefers LFS sha256 if available; falls back to git blob_id (sha1). + """ + lfs = getattr(entry, "lfs", None) + lfs_sha = getattr(lfs, "sha256", None) if lfs is not None else None + if lfs_sha is None and isinstance(lfs, dict): + lfs_sha = lfs.get("sha256") + + if lfs_sha: + return ("sha256", str(lfs_sha).lower()) + + blob_id = getattr(entry, "blob_id", None) + if blob_id: + return ("git-sha1", str(blob_id).lower()) + + raise ValueError("Remote entry missing checksum (no blob_id or lfs.sha256)") + + +def compute_file_hash(path: Path, algorithm: HashAlgo, *, git_hash_cache: dict[Path, str]) -> str: + """ + Compute the checksum of a local file using the requested algorithm. 
+ """ + + def _sha256(p: Path, cache: dict[Path, str]) -> str: + with p.open("rb") as stream: + return sha_fileobj(stream).hex() + + def _git_sha1(p: Path, cache: dict[Path, str]) -> str: + try: + return cache[p] + except KeyError: + with p.open("rb") as stream: + data = stream.read() + digest = git_hash(data) + cache[p] = digest + return digest + + HASHERS: dict[HashAlgo, Callable[[Path, dict[Path, str]], str]] = { + "sha256": _sha256, + "git-sha1": _git_sha1, + } + + try: + return HASHERS[algorithm](path, git_hash_cache) + except KeyError: + # Should be unreachable, but keeps type checker happy + raise ValueError(f"Unsupported hash algorithm: {algorithm}") + + +def verify_maps(*, remote_by_path: dict[str, object], local_by_path: dict[str, Path], revision: str) -> Verification: + """Compare remote entries and local files and return a verification result.""" + remote_paths = set(remote_by_path) + local_paths = set(local_by_path) + + missing = sorted(remote_paths - local_paths) + extra = sorted(local_paths - remote_paths) + both = sorted(remote_paths & local_paths) + + mismatches: list[Mismatch] = [] + git_hash_cache: dict[Path, str] = {} + + for rel_path in both: + entry = remote_by_path[rel_path] + local_path = local_by_path[rel_path] + + try: + algorithm, expected = resolve_expected_hash(entry) + actual = compute_file_hash(local_path, algorithm, git_hash_cache=git_hash_cache) + except OSError as exc: + mismatches.append( + Mismatch(path=rel_path, expected="", actual=f"io-error:{exc}", algorithm="io") + ) + continue + except ValueError as exc: + mismatches.append( + Mismatch(path=rel_path, expected="", actual=f"meta-error:{exc}", algorithm="meta") + ) + continue + + if actual != expected: + mismatches.append(Mismatch(path=rel_path, expected=expected, actual=actual, algorithm=algorithm)) + + return Verification( + revision=revision, + checked_count=len(both), + mismatches=mismatches, + missing_paths=missing, + extra_paths=extra, + ) + + +def resolve_local_root( + 
*, + repo_id: str, + repo_type: str, + revision: Optional[str], + cache_dir: Optional[Path], + local_dir: Optional[Path], +) -> tuple[Path, str]: + """ + Resolve the root directory to scan locally and the remote revision to verify. + """ + if local_dir is not None: + root = Path(local_dir).expanduser().resolve() + if not root.is_dir(): + raise ValueError(f"Local directory does not exist or is not a directory: {root}") + return root, (revision or constants.DEFAULT_REVISION) + + cache_root = Path(cache_dir or constants.HF_HUB_CACHE).expanduser().resolve() + storage_folder = cache_root / repo_folder_name(repo_id=repo_id, repo_type=repo_type) + if not storage_folder.exists(): + raise ValueError( + f"Repo is not present in cache: {storage_folder}. Use 'hf download' first or pass --local-dir." + ) + commit = _resolve_commit_hash_from_cache(storage_folder, revision) + snapshot_dir = storage_folder / "snapshots" / commit + if not snapshot_dir.is_dir(): + raise ValueError(f"Snapshot directory does not exist for revision '{commit}': {snapshot_dir}.") + return snapshot_dir, commit + + +def collect_local_files(root: Path) -> dict[str, Path]: + """Collect all files under a root directory (either a cache snapshot or a regular folder).""" + return _collect_files_from_directory(root) diff --git a/tests/test_cli.py b/tests/test_cli.py index e3bec53930..0a6073457d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -3,6 +3,7 @@ import warnings from contextlib import contextmanager from pathlib import Path +from types import SimpleNamespace from typing import Generator, Optional from unittest.mock import Mock, patch @@ -16,7 +17,14 @@ from huggingface_hub.cli.hf import app from huggingface_hub.cli.upload import _resolve_upload_paths, upload from huggingface_hub.errors import RevisionNotFoundError -from huggingface_hub.utils import CachedRepoInfo, CachedRevisionInfo, SoftTemporaryDirectory +from huggingface_hub.utils import ( + CachedFileInfo, + CachedRepoInfo, + 
CachedRevisionInfo, + HFCacheInfo, + SoftTemporaryDirectory, +) +from huggingface_hub.utils._verification import Verification from .testing_utils import DUMMY_MODEL_ID @@ -207,6 +215,116 @@ def test_prune_dry_run(self, runner: CliRunner) -> None: strategy.execute.assert_not_called() print_mock.assert_called_once() + def test_verify_success(self, runner: CliRunner) -> None: + repo_id = "user/model" + result_obj = Verification(revision="main", checked_count=1, mismatches=[], missing_paths=[], extra_paths=[]) + + with patch("huggingface_hub.cli.cache.get_hf_api") as get_api_mock: + api = get_api_mock.return_value + api.verify_repo_checksums.return_value = result_obj + result = runner.invoke(app, ["cache", "verify", repo_id]) + + assert result.exit_code == 0 + assert "Verified 1 file(s)" in result.stdout + get_api_mock.assert_called_once() + api.verify_repo_checksums.assert_called_once_with( + repo_id=repo_id, + repo_type="model", + revision=None, + cache_dir=None, + local_dir=None, + token=None, + ) + + def test_verify_reports_mismatch(self, runner: CliRunner) -> None: + repo_id = "user/model" + result_obj = Verification( + revision="main", + checked_count=1, + mismatches=[{"path": "pytorch_model.bin", "expected": "dead", "actual": "beef", "algorithm": "sha256"}], + missing_paths=[], + extra_paths=[], + ) + + with patch("huggingface_hub.cli.cache.get_hf_api") as get_api_mock: + api = get_api_mock.return_value + api.verify_repo_checksums.return_value = result_obj + result = runner.invoke(app, ["cache", "verify", repo_id]) + + assert result.exit_code == 1 + assert "Checksum verification failed" in result.stdout + assert "pytorch_model.bin" in result.stdout + assert "expected" in result.stdout + + def test_verify_reports_missing_local_file(self, runner: CliRunner) -> None: + commit_hash = "4" * 40 + repo_id = "user/model" + file_name = "config.json" + + with SoftTemporaryDirectory() as tmp_dir: + base = Path(tmp_dir) + snapshot_path = base / "snapshots" / commit_hash + 
snapshot_path.mkdir(parents=True) + + blob_dir = base / "blobs" + blob_dir.mkdir() + + blob_path = blob_dir / ("a" * 64) + blob_path.write_bytes(b"hello") + + file_path = snapshot_path / file_name + file_path.touch() + + file_info = CachedFileInfo( + file_name=file_name, + file_path=file_path, + blob_path=blob_path, + size_on_disk=blob_path.stat().st_size, + blob_last_accessed=0.0, + blob_last_modified=0.0, + ) + revision = CachedRevisionInfo( + commit_hash=commit_hash, + snapshot_path=snapshot_path, + size_on_disk=blob_path.stat().st_size, + files=frozenset({file_info}), + refs=frozenset({"main"}), + last_modified=0.0, + ) + repo = CachedRepoInfo( + repo_id=repo_id, + repo_type="model", + repo_path=base, + size_on_disk=blob_path.stat().st_size, + nb_files=1, + revisions=frozenset({revision}), + last_accessed=0.0, + last_modified=0.0, + ) + hf_cache_info = HFCacheInfo( + size_on_disk=blob_path.stat().st_size, + repos=frozenset({repo}), + warnings=[], + ) + + with ( + patch("huggingface_hub.cli.cache.scan_cache_dir", return_value=hf_cache_info), + patch("huggingface_hub.cli.cache.get_hf_api") as get_api_mock, + ): + api = get_api_mock.return_value + api.list_repo_tree.return_value = [ + SimpleNamespace(path=file_name, blob_id="unused", lfs=None), + SimpleNamespace( + path="missing.txt", + blob_id="blobid", + lfs=None, + ), + ] + result = runner.invoke(app, ["cache", "verify", repo.cache_id]) + + assert result.exit_code == 1 + assert "missing locally" in result.stdout + class TestUploadCommand: def test_upload_basic(self, runner: CliRunner) -> None: diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py index e700dde3b0..745451a330 100644 --- a/tests/test_hf_api.py +++ b/tests/test_hf_api.py @@ -4602,3 +4602,27 @@ def test_create_inference_endpoint_custom_image_payload( assert "model" in payload and "image" in payload["model"] assert payload["model"]["image"] == expected_image_payload + + +class HfApiVerifyChecksumsTest(HfApiCommonTest): + def 
test_verify_repo_checksums_with_local_cache(self) -> None: + repo_id = self._api.create_repo(repo_name()).repo_id + self._api.create_commit( + repo_id=repo_id, + commit_message="add file", + operations=[CommitOperationAdd(path_or_fileobj=b"data", path_in_repo="file.txt")], + ) + + # minimal cache layout + info = self._api.repo_info(repo_id) + commit = info.sha + parts = [f"{constants.REPO_TYPE_MODEL}s", *repo_id.split("/")] + repo_folder_name = constants.REPO_ID_SEPARATOR.join(parts) + + storage = Path(constants.HF_HUB_CACHE) / repo_folder_name + snapshot = storage / "snapshots" / commit + snapshot.mkdir(parents=True, exist_ok=True) + (snapshot / "file.txt").write_bytes(b"data") + + res = self._api.verify_repo_checksums(repo_id=repo_id, revision=commit, cache_dir=storage.parent) + assert res.revision == commit and res.checked_count == 1 and not res.mismatches diff --git a/tests/test_verification.py b/tests/test_verification.py new file mode 100644 index 0000000000..dcd1f0b2ba --- /dev/null +++ b/tests/test_verification.py @@ -0,0 +1,113 @@ +import hashlib +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import patch + +from huggingface_hub.hf_api import HfApi +from huggingface_hub.utils._verification import ( + collect_local_files, + compute_file_hash, + resolve_expected_hash, + resolve_local_root, + verify_maps, +) +from huggingface_hub.utils.sha import git_hash + + +def _write(p: Path, data: bytes) -> None: + p.parent.mkdir(parents=True, exist_ok=True) + p.write_bytes(data) + + +def test_collect_local_files_lists_all(tmp_path: Path) -> None: + base = tmp_path + (_ := base / "a" / "b.txt").parent.mkdir(parents=True, exist_ok=True) + (base / "a" / "b.txt").write_text("x") + (base / "c.bin").write_bytes(b"y") + + mapping = collect_local_files(base) + assert mapping["a/b.txt"].read_text() == "x" + assert mapping["c.bin"].read_bytes() == b"y" + + +def test_resolve_local_root_cache_single_snapshot(tmp_path: Path) -> None: + cache_dir = 
tmp_path + storage = cache_dir / "models--user--model" + (storage / "blobs").mkdir(parents=True) + commit = "a" * 40 + snapshot = storage / "snapshots" / commit + snapshot.mkdir(parents=True) + _write(snapshot / "config.json", b"{}") + _write(snapshot / "nested" / "file.txt", b"hello") + + root, resolved_revision = resolve_local_root( + repo_id="user/model", repo_type="model", revision=commit, cache_dir=cache_dir, local_dir=None + ) + assert resolved_revision == commit + mapping = collect_local_files(root) + assert sorted(mapping.keys()) == ["config.json", "nested/file.txt"] + + +def test_resolve_expected_hash_prefers_lfs_sha256() -> None: + entry = SimpleNamespace(path="x", blob_id="deadbeef", lfs={"sha256": "cafebabe"}) + algo, expected = resolve_expected_hash(entry) + assert algo == "sha256" and expected == "cafebabe" + + +def test_compute_file_hash_git_sha1_stream(tmp_path: Path) -> None: + data = b"content-xyz" + p = tmp_path / "f.bin" + _write(p, data) + # expected git-sha1 (with header) + expected = git_hash(data) + actual = compute_file_hash(p, "git-sha1", git_hash_cache={}) + assert actual == expected + + +def test_verify_maps_success_local_dir(tmp_path: Path) -> None: + # local + loc = tmp_path / "loc" + loc.mkdir() + _write(loc / "a.txt", b"aa") + _write(loc / "b.txt", b"bb") + local_by_path = collect_local_files(loc) + # remote entries (non-LFS for a.txt; LFS for b.txt) + remote_by_path = { + "a.txt": SimpleNamespace(path="a.txt", blob_id=git_hash(b"aa"), lfs=None), + "b.txt": SimpleNamespace(path="b.txt", blob_id="unused", lfs={"sha256": hashlib.sha256(b"bb").hexdigest()}), + } + res = verify_maps(remote_by_path=remote_by_path, local_by_path=local_by_path, revision="abc") + assert res.checked_count == 2 and not res.mismatches and not res.missing_paths and not res.extra_paths + + +def test_verify_maps_reports_mismatch(tmp_path: Path) -> None: + loc = tmp_path / "loc2" + loc.mkdir() + _write(loc / "a.txt", b"wrong") + local_by_path = 
collect_local_files(loc) + remote_by_path = {"a.txt": SimpleNamespace(path="a.txt", blob_id=git_hash(b"right"), lfs=None)} + res = verify_maps(remote_by_path=remote_by_path, local_by_path=local_by_path, revision="r") + assert len(res.mismatches) == 1 + m = res.mismatches[0] + assert m["path"] == "a.txt" and m["algorithm"] == "git-sha1" + + +def test_api_verify_repo_checksums_cache_mode(tmp_path: Path) -> None: + # minimal dummy cache structure + cache_dir = tmp_path + commit = "b" * 40 + storage = cache_dir / "models--user--model" + snapshot = storage / "snapshots" / commit + snapshot.mkdir(parents=True) + content = b"hello-world" + _write(snapshot / "file.txt", content) + + with patch.object( + HfApi, + "list_repo_tree", + return_value=[SimpleNamespace(path="file.txt", blob_id=git_hash(content), lfs=None)], + ): + res = HfApi().verify_repo_checksums( + repo_id="user/model", repo_type="model", revision=commit, cache_dir=cache_dir, token=None + ) + assert res.revision == commit and res.checked_count == 1 and not res.mismatches From b01494823aa91fc8ffa8afad758ade806b8589a9 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Fri, 24 Oct 2025 17:30:41 +0200 Subject: [PATCH 02/19] better docs --- docs/source/en/guides/cli.md | 2 +- docs/source/en/guides/manage-cache.md | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/docs/source/en/guides/cli.md b/docs/source/en/guides/cli.md index 5ff4dd85c4..34c452e507 100644 --- a/docs/source/en/guides/cli.md +++ b/docs/source/en/guides/cli.md @@ -700,7 +700,7 @@ Examples: By default, the command warns about missing or extra files but does not fail. 
Use flags to make these conditions fail the command: ```bash ->>> hf cache verify gpt2 --fail-on-missing-files --fail-on-extra-files +>>> hf cache verify deepseek-ai/DeepSeek-OCR --fail-on-missing-files --fail-on-extra-files ``` On success, you will see a summary: diff --git a/docs/source/en/guides/manage-cache.md b/docs/source/en/guides/manage-cache.md index 9222275d38..12e201ae1f 100644 --- a/docs/source/en/guides/manage-cache.md +++ b/docs/source/en/guides/manage-cache.md @@ -486,16 +486,14 @@ HFCacheInfo( Verify a whole cached repository by repo ID (verifies every cached revision for that repo): ```bash ->>> hf cache verify model/sentence-transformers/all-MiniLM-L6-v2 -✅ Verified 28 file(s) across 1 revision(s); no checksum mismatches detected. +>>> hf cache verify meta-llama/Llama-3.2-1B-Instruct +✅ Verified 13 file(s) at 9213176726f574b556790deb65791e0c5aa438b6; no checksum mismatches. ``` -Verify specific cached revisions by hash (you can pass several targets at once): +Verify a specific cached revision: -```text -➜ hf cache verify 1c610f6b3f5e7d8a d4ec9b72 -❌ Checksum verification failed for the following file(s): - - dataset/nyu-mll/glue@bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c::cola/test-00000-of-00001.parquet: missing locally. 
+```bash +>>> hf cache verify meta-llama/Llama-3.2-1B-InstructR --revision abcdef123 ``` > [!TIP] From f1fa8d33c24a3630b81cb090ce5aa56d22f5aac9 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Mon, 27 Oct 2025 13:51:50 +0100 Subject: [PATCH 03/19] review suggestions --- docs/source/en/guides/manage-cache.md | 2 +- src/huggingface_hub/cli/cache.py | 20 ++++++++------------ src/huggingface_hub/hf_api.py | 2 +- src/huggingface_hub/utils/_verification.py | 21 +++++++-------------- 4 files changed, 17 insertions(+), 28 deletions(-) diff --git a/docs/source/en/guides/manage-cache.md b/docs/source/en/guides/manage-cache.md index 12e201ae1f..bb49aec41a 100644 --- a/docs/source/en/guides/manage-cache.md +++ b/docs/source/en/guides/manage-cache.md @@ -497,7 +497,7 @@ Verify a specific cached revision: ``` > [!TIP] -> Pair `hf cache verify` with `--cache-dir PATH` when working outside the default cache, and `--token` to verify against private or gated repositories. +> Check the [`hf cache verify` CLI reference](../package_reference/cli#hf-cache-verify) for more details about the usage and a complete list of options. 
### Clean your cache diff --git a/src/huggingface_hub/cli/cache.py b/src/huggingface_hub/cli/cache.py index 0ed5241dda..a938fc0c59 100644 --- a/src/huggingface_hub/cli/cache.py +++ b/src/huggingface_hub/cli/cache.py @@ -682,18 +682,14 @@ def verify( api = get_hf_api(token=token) - try: - result = api.verify_repo_checksums( - repo_id=repo_id, - repo_type=repo_type.value if hasattr(repo_type, "value") else str(repo_type), - revision=revision, - local_dir=local_dir, - cache_dir=cache_dir, - token=token, - ) - except ValueError as exc: - print(str(exc)) - raise typer.Exit(code=1) + result = api.verify_repo_checksums( + repo_id=repo_id, + repo_type=repo_type.value if hasattr(repo_type, "value") else str(repo_type), + revision=revision, + local_dir=local_dir, + cache_dir=cache_dir, + token=token, + ) # Print mismatches first if any if result.mismatches: diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index d601c720d9..561656497c 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -3144,7 +3144,7 @@ def verify_repo_checksums( local_by_path = collect_local_files(root) # get remote entries - remote_by_path: dict[str, object] = {} + remote_by_path: dict[str, Union[RepoFile, RepoFolder]] = {} for entry in self.list_repo_tree( repo_id=repo_id, recursive=True, revision=remote_revision, repo_type=repo_type, token=token ): diff --git a/src/huggingface_hub/utils/_verification.py b/src/huggingface_hub/utils/_verification.py index 9745b335eb..49ddac79f4 100644 --- a/src/huggingface_hub/utils/_verification.py +++ b/src/huggingface_hub/utils/_verification.py @@ -1,7 +1,7 @@ import re from dataclasses import dataclass from pathlib import Path -from typing import Callable, Literal, Optional, TypedDict +from typing import Any, Callable, Literal, Optional, TypedDict from .. 
import constants from ..file_download import repo_folder_name @@ -24,19 +24,15 @@ class Mismatch(TypedDict): @dataclass(frozen=True) -class Verification: +class FolderVerification: revision: str checked_count: int mismatches: list[Mismatch] missing_paths: list[str] extra_paths: list[str] - @property - def ok(self) -> bool: - return not (self.mismatches or self.missing_paths or self.extra_paths) - -def _collect_files_from_directory(root: Path) -> dict[str, Path]: +def collect_local_files(root: Path) -> dict[str, Path]: """ Return a mapping of repo-relative path -> absolute path for all files under `root`. """ @@ -127,7 +123,9 @@ def _git_sha1(p: Path, cache: dict[Path, str]) -> str: raise ValueError(f"Unsupported hash algorithm: {algorithm}") -def verify_maps(*, remote_by_path: dict[str, object], local_by_path: dict[str, Path], revision: str) -> Verification: +def verify_maps( + *, remote_by_path: dict[str, Any], local_by_path: dict[str, Path], revision: str +) -> FolderVerification: """Compare remote entries and local files and return a verification result.""" remote_paths = set(remote_by_path) local_paths = set(local_by_path) @@ -160,7 +158,7 @@ def verify_maps(*, remote_by_path: dict[str, object], local_by_path: dict[str, P if actual != expected: mismatches.append(Mismatch(path=rel_path, expected=expected, actual=actual, algorithm=algorithm)) - return Verification( + return FolderVerification( revision=revision, checked_count=len(both), mismatches=mismatches, @@ -197,8 +195,3 @@ def resolve_local_root( if not snapshot_dir.is_dir(): raise ValueError(f"Snapshot directory does not exist for revision '{commit}': {snapshot_dir}.") return snapshot_dir, commit - - -def collect_local_files(root: Path) -> dict[str, Path]: - """Collect all files under a root directory (either a cache snapshot or a regular folder).""" - return _collect_files_from_directory(root) From f275b9366a4ed55ce9915e3767932d47e80dc8cd Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Mon, 27 Oct 
2025 14:21:12 +0100 Subject: [PATCH 04/19] small refacto --- src/huggingface_hub/hf_api.py | 6 ++-- src/huggingface_hub/utils/_verification.py | 35 +++++++++------------- tests/test_cli.py | 8 +++-- 3 files changed, 22 insertions(+), 27 deletions(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 561656497c..e212db0257 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -111,7 +111,7 @@ if TYPE_CHECKING: from .inference._providers import PROVIDER_T - from .utils._verification import Verification + from .utils._verification import FolderVerification R = TypeVar("R") # Return type CollectionItemType_T = Literal["model", "dataset", "space", "paper", "collection"] @@ -3092,7 +3092,7 @@ def verify_repo_checksums( local_dir: Optional[Union[str, Path]] = None, cache_dir: Optional[Union[str, Path]] = None, token: Union[str, bool, None] = None, - ) -> "Verification": + ) -> "FolderVerification": """ Verify local files for a repo against Hub checksums. @@ -3115,7 +3115,7 @@ def verify_repo_checksums( To disable authentication, pass `False`. Returns: - [`Verification`]: a structured result containing the verification details. + [`FolderVerification`]: a structured result containing the verification details. Raises: [`~utils.RepositoryNotFoundError`]: diff --git a/src/huggingface_hub/utils/_verification.py b/src/huggingface_hub/utils/_verification.py index 49ddac79f4..a9e3fded87 100644 --- a/src/huggingface_hub/utils/_verification.py +++ b/src/huggingface_hub/utils/_verification.py @@ -1,13 +1,16 @@ import re from dataclasses import dataclass from pathlib import Path -from typing import Any, Callable, Literal, Optional, TypedDict +from typing import TYPE_CHECKING, Literal, Optional, TypedDict, Union from .. 
import constants from ..file_download import repo_folder_name from .sha import git_hash, sha_fileobj +if TYPE_CHECKING: + from ..hf_api import RepoFile, RepoFolder + # using fullmatch for clarity and strictness _REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$") @@ -72,7 +75,7 @@ def _resolve_commit_hash_from_cache(storage_folder: Path, revision: Optional[str ) -def resolve_expected_hash(entry: object) -> tuple[HashAlgo, str]: +def resolve_expected_hash(entry: Union["RepoFile", "RepoFolder"]) -> tuple[HashAlgo, str]: """ Return the algorithm and expected hash for a remote entry. Prefers LFS sha256 if available; falls back to git blob_id (sha1). @@ -97,34 +100,24 @@ def compute_file_hash(path: Path, algorithm: HashAlgo, *, git_hash_cache: dict[P Compute the checksum of a local file using the requested algorithm. """ - def _sha256(p: Path, cache: dict[Path, str]) -> str: - with p.open("rb") as stream: + if algorithm == "sha256": + with path.open("rb") as stream: return sha_fileobj(stream).hex() - def _git_sha1(p: Path, cache: dict[Path, str]) -> str: + if algorithm == "git-sha1": try: - return cache[p] + return git_hash_cache[path] except KeyError: - with p.open("rb") as stream: - data = stream.read() - digest = git_hash(data) - cache[p] = digest + with path.open("rb") as stream: + digest = git_hash(stream.read()) + git_hash_cache[path] = digest return digest - HASHERS: dict[HashAlgo, Callable[[Path, dict[Path, str]], str]] = { - "sha256": _sha256, - "git-sha1": _git_sha1, - } - - try: - return HASHERS[algorithm](path, git_hash_cache) - except KeyError: - # Should be unreachable, but keeps type checker happy - raise ValueError(f"Unsupported hash algorithm: {algorithm}") + raise ValueError(f"Unsupported hash algorithm: {algorithm}") def verify_maps( - *, remote_by_path: dict[str, Any], local_by_path: dict[str, Path], revision: str + *, remote_by_path: dict[str, Union["RepoFile", "RepoFolder"]], local_by_path: dict[str, Path], revision: str ) -> FolderVerification: 
"""Compare remote entries and local files and return a verification result.""" remote_paths = set(remote_by_path) diff --git a/tests/test_cli.py b/tests/test_cli.py index 0a6073457d..fa5691160c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -24,7 +24,7 @@ HFCacheInfo, SoftTemporaryDirectory, ) -from huggingface_hub.utils._verification import Verification +from huggingface_hub.utils._verification import FolderVerification from .testing_utils import DUMMY_MODEL_ID @@ -217,7 +217,9 @@ def test_prune_dry_run(self, runner: CliRunner) -> None: def test_verify_success(self, runner: CliRunner) -> None: repo_id = "user/model" - result_obj = Verification(revision="main", checked_count=1, mismatches=[], missing_paths=[], extra_paths=[]) + result_obj = FolderVerification( + revision="main", checked_count=1, mismatches=[], missing_paths=[], extra_paths=[] + ) with patch("huggingface_hub.cli.cache.get_hf_api") as get_api_mock: api = get_api_mock.return_value @@ -238,7 +240,7 @@ def test_verify_success(self, runner: CliRunner) -> None: def test_verify_reports_mismatch(self, runner: CliRunner) -> None: repo_id = "user/model" - result_obj = Verification( + result_obj = FolderVerification( revision="main", checked_count=1, mismatches=[{"path": "pytorch_model.bin", "expected": "dead", "actual": "beef", "algorithm": "sha256"}], From 73a4607a14c598f5ab2fbeb7bb4e26fcfb9baac9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?c=C3=A9lina?= Date: Wed, 29 Oct 2025 14:42:19 +0100 Subject: [PATCH 05/19] Apply suggestions from code review Co-authored-by: Lucain --- docs/source/en/guides/cli.md | 8 ++++---- docs/source/en/guides/manage-cache.md | 5 ++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/docs/source/en/guides/cli.md b/docs/source/en/guides/cli.md index 34c452e507..9b01369995 100644 --- a/docs/source/en/guides/cli.md +++ b/docs/source/en/guides/cli.md @@ -675,7 +675,7 @@ As with the other cache commands, `--dry-run`, `--yes`, and `--cache-dir` are av ## hf cache 
verify -Use `hf cache verify` to validate local files against their checksums on the Hub. Target a single repo per invocation and choose between verifying the cache snapshot or a regular local directory. +Use `hf cache verify` to validate local files against their checksums on the Hub. You can verify either a cache snapshot or a regular local directory. Examples: @@ -684,8 +684,8 @@ Examples: >>> hf cache verify deepseek-ai/DeepSeek-OCR # Verify a specific revision ->>> hf cache verify deepseek-ai/DeepSeek-OCR --revision refs/pr/1 ->>> hf cache verify deepseek-ai/DeepSeek-OCR --revision abcdef123 +>>> hf cache verify deepseek-ai/DeepSeek-OCR --revision refs/pr/5 +>>> hf cache verify deepseek-ai/DeepSeek-OCR --revision ef93bf4a377c5d5ed9dca78e0bc4ea50b26fe6a4 # Verify a private repo >>> hf cache verify me/private-model --token hf_*** @@ -697,7 +697,7 @@ Examples: >>> hf cache verify deepseek-ai/DeepSeek-OCR --local-dir /path/to/repo ``` -By default, the command warns about missing or extra files but does not fail. Use flags to make these conditions fail the command: +By default, the command warns about missing or extra files. Use flags to turn these warnings into errors: ```bash >>> hf cache verify deepseek-ai/DeepSeek-OCR --fail-on-missing-files --fail-on-extra-files diff --git a/docs/source/en/guides/manage-cache.md b/docs/source/en/guides/manage-cache.md index bb49aec41a..7d8dc03c2b 100644 --- a/docs/source/en/guides/manage-cache.md +++ b/docs/source/en/guides/manage-cache.md @@ -481,9 +481,8 @@ HFCacheInfo( ### Verify your cache -`huggingface_hub` can verify that your cached files match the checksums on the Hub. Use `hf cache verify` from the CLI to validate one or more cached repositories or specific revisions. +`huggingface_hub` can verify that your cached files match the checksums on the Hub. 
Use `hf cache verify` CLI to validate file consistency for a specific revision of a specific repository: -Verify a whole cached repository by repo ID (verifies every cached revision for that repo): ```bash >>> hf cache verify meta-llama/Llama-3.2-1B-Instruct @@ -493,7 +492,7 @@ Verify a whole cached repository by repo ID (verifies every cached revision for Verify a specific cached revision: ```bash ->>> hf cache verify meta-llama/Llama-3.2-1B-InstructR --revision abcdef123 +>>> hf cache verify meta-llama/Llama-3.1-8B-Instruct --revision 0e9e39f249a16976918f6564b8830bc894c89659 ``` > [!TIP] From 7b2ea16a5b74f5abfb9753d0ec882e4ccd842d81 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Wed, 29 Oct 2025 14:55:13 +0100 Subject: [PATCH 06/19] update boolean options --- src/huggingface_hub/cli/cache.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/huggingface_hub/cli/cache.py b/src/huggingface_hub/cli/cache.py index a938fc0c59..ce642f22d3 100644 --- a/src/huggingface_hub/cli/cache.py +++ b/src/huggingface_hub/cli/cache.py @@ -656,12 +656,14 @@ def verify( fail_on_missing_files: Annotated[ bool, typer.Option( + "--fail-on-missing-files", help="Fail if some files exist on the remote but are missing locally.", ), ] = False, fail_on_extra_files: Annotated[ bool, typer.Option( + "--fail-on-extra-files", help="Fail if some files exist locally but are not present on the remote revision.", ), ] = False, From eba1cc7016023e161a4ce95f83a99e2bdf0e74f3 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Wed, 29 Oct 2025 14:57:16 +0100 Subject: [PATCH 07/19] update docstring --- docs/source/en/package_reference/cli.md | 4 ++-- src/huggingface_hub/hf_api.py | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/source/en/package_reference/cli.md b/docs/source/en/package_reference/cli.md index ba22bce8bb..870a3eacfe 100644 --- a/docs/source/en/package_reference/cli.md +++ b/docs/source/en/package_reference/cli.md @@ -237,8 +237,8 @@ $ hf cache verify 
[OPTIONS] REPO_ID * `--revision TEXT`: Git revision id which can be a branch name, a tag, or a commit hash. * `--cache-dir TEXT`: Cache directory to use when verifying files from cache (defaults to Hugging Face cache). * `--local-dir TEXT`: If set, verify files under this directory instead of the cache. -* `--fail-on-missing-files / --no-fail-on-missing-files`: Fail if some files exist on the remote but are missing locally. [default: no-fail-on-missing-files] -* `--fail-on-extra-files / --no-fail-on-extra-files`: Fail if some files exist locally but are not present on the remote revision. [default: no-fail-on-extra-files] +* `--fail-on-missing-files`: Fail if some files exist on the remote but are missing locally. +* `--fail-on-extra-files`: Fail if some files exist locally but are not present on the remote revision. * `--token TEXT`: A User Access Token generated from https://huggingface.co/settings/tokens. * `--help`: Show this message and exit. diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index e212db0257..d264623872 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -3123,8 +3123,6 @@ def verify_repo_checksums( does not exist. [`~utils.RevisionNotFoundError`]: If revision is not found (error 404) on the repo. - [`~utils.RemoteEntryNotFoundError`]: - If the tree (folder) does not exist (error 404) on the repo. 
""" From 8d6c1f2ddcbccde9ca945da3291aa7c5d57fb0a9 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Wed, 29 Oct 2025 15:58:28 +0100 Subject: [PATCH 08/19] remove helper --- src/huggingface_hub/utils/_verification.py | 23 +++++++++++++++++---- tests/test_verification.py | 24 +--------------------- 2 files changed, 20 insertions(+), 27 deletions(-) diff --git a/src/huggingface_hub/utils/_verification.py b/src/huggingface_hub/utils/_verification.py index a9e3fded87..f39f558408 100644 --- a/src/huggingface_hub/utils/_verification.py +++ b/src/huggingface_hub/utils/_verification.py @@ -131,12 +131,27 @@ def verify_maps( git_hash_cache: dict[Path, str] = {} for rel_path in both: - entry = remote_by_path[rel_path] + remote_entry = remote_by_path[rel_path] local_path = local_by_path[rel_path] try: - algorithm, expected = resolve_expected_hash(entry) + lfs = getattr(remote_entry, "lfs", None) + lfs_sha = getattr(lfs, "sha256", None) if lfs is not None else None + if lfs_sha is None and isinstance(lfs, dict): + lfs_sha = lfs.get("sha256") + + if lfs_sha: + algorithm: HashAlgo = "sha256" + expected = str(lfs_sha).lower() + else: + blob_id = getattr(remote_entry, "blob_id", None) + if not blob_id: + raise ValueError("Remote entry missing checksum (no blob_id or lfs.sha256)") + algorithm = "git-sha1" + expected = str(blob_id).lower() + actual = compute_file_hash(local_path, algorithm, git_hash_cache=git_hash_cache) + except OSError as exc: mismatches.append( Mismatch(path=rel_path, expected="", actual=f"io-error:{exc}", algorithm="io") @@ -148,8 +163,8 @@ def verify_maps( ) continue - if actual != expected: - mismatches.append(Mismatch(path=rel_path, expected=expected, actual=actual, algorithm=algorithm)) + if actual != expected: + mismatches.append(Mismatch(path=rel_path, expected=expected, actual=actual, algorithm=algorithm)) return FolderVerification( revision=revision, diff --git a/tests/test_verification.py b/tests/test_verification.py index dcd1f0b2ba..240c703a7d 100644 
--- a/tests/test_verification.py +++ b/tests/test_verification.py @@ -4,13 +4,7 @@ from unittest.mock import patch from huggingface_hub.hf_api import HfApi -from huggingface_hub.utils._verification import ( - collect_local_files, - compute_file_hash, - resolve_expected_hash, - resolve_local_root, - verify_maps, -) +from huggingface_hub.utils._verification import collect_local_files, resolve_local_root, verify_maps from huggingface_hub.utils.sha import git_hash @@ -48,22 +42,6 @@ def test_resolve_local_root_cache_single_snapshot(tmp_path: Path) -> None: assert sorted(mapping.keys()) == ["config.json", "nested/file.txt"] -def test_resolve_expected_hash_prefers_lfs_sha256() -> None: - entry = SimpleNamespace(path="x", blob_id="deadbeef", lfs={"sha256": "cafebabe"}) - algo, expected = resolve_expected_hash(entry) - assert algo == "sha256" and expected == "cafebabe" - - -def test_compute_file_hash_git_sha1_stream(tmp_path: Path) -> None: - data = b"content-xyz" - p = tmp_path / "f.bin" - _write(p, data) - # expected git-sha1 (with header) - expected = git_hash(data) - actual = compute_file_hash(p, "git-sha1", git_hash_cache={}) - assert actual == expected - - def test_verify_maps_success_local_dir(tmp_path: Path) -> None: # local loc = tmp_path / "loc" From cd158f206d3e0ae337e526b11053d107d3e480d6 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Wed, 29 Oct 2025 16:36:43 +0100 Subject: [PATCH 09/19] another refactor --- src/huggingface_hub/__init__.py | 2 ++ src/huggingface_hub/hf_api.py | 12 ++---------- src/huggingface_hub/utils/_verification.py | 18 +++--------------- 3 files changed, 7 insertions(+), 25 deletions(-) diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index 54f968ef60..0ffdd40bd8 100644 --- a/src/huggingface_hub/__init__.py +++ b/src/huggingface_hub/__init__.py @@ -288,6 +288,7 @@ "upload_file", "upload_folder", "upload_large_folder", + "verify_repo_checksums", "whoami", ], "hf_file_system": [ @@ -1302,6 +1303,7 @@ def 
__dir__(): upload_file, # noqa: F401 upload_folder, # noqa: F401 upload_large_folder, # noqa: F401 + verify_repo_checksums, # noqa: F401 whoami, # noqa: F401 ) from .hf_file_system import ( diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index d264623872..df75da3f91 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -598,7 +598,7 @@ class RepoFile: The file's size, in bytes. blob_id (`str`): The file's git OID. - lfs (`BlobLfsInfo`): + lfs (`BlobLfsInfo`, *optional*): The file's LFS metadata. last_commit (`LastCommitInfo`, *optional*): The file's last commit metadata. Only defined if [`list_repo_tree`] and [`get_paths_info`] @@ -3146,15 +3146,7 @@ def verify_repo_checksums( for entry in self.list_repo_tree( repo_id=repo_id, recursive=True, revision=remote_revision, repo_type=repo_type, token=token ): - path = getattr(entry, "path", None) - if not path: - continue - lfs = getattr(entry, "lfs", None) - has_lfs_sha = (getattr(lfs, "sha256", None) is not None) or ( - isinstance(lfs, dict) and lfs.get("sha256") is not None - ) - if hasattr(entry, "blob_id") or has_lfs_sha: - remote_by_path[path] = entry + remote_by_path[entry.path] = entry return verify_maps(remote_by_path=remote_by_path, local_by_path=local_by_path, revision=remote_revision) diff --git a/src/huggingface_hub/utils/_verification.py b/src/huggingface_hub/utils/_verification.py index f39f558408..d211f08064 100644 --- a/src/huggingface_hub/utils/_verification.py +++ b/src/huggingface_hub/utils/_verification.py @@ -87,12 +87,8 @@ def resolve_expected_hash(entry: Union["RepoFile", "RepoFolder"]) -> tuple[HashA if lfs_sha: return ("sha256", str(lfs_sha).lower()) - - blob_id = getattr(entry, "blob_id", None) - if blob_id: - return ("git-sha1", str(blob_id).lower()) - - raise ValueError("Remote entry missing checksum (no blob_id or lfs.sha256)") + blob_id = entry.blob_id # type: ignore + return ("git-sha1", str(blob_id).lower()) def 
compute_file_hash(path: Path, algorithm: HashAlgo, *, git_hash_cache: dict[Path, str]) -> str: @@ -139,14 +135,11 @@ def verify_maps( lfs_sha = getattr(lfs, "sha256", None) if lfs is not None else None if lfs_sha is None and isinstance(lfs, dict): lfs_sha = lfs.get("sha256") - if lfs_sha: algorithm: HashAlgo = "sha256" expected = str(lfs_sha).lower() else: - blob_id = getattr(remote_entry, "blob_id", None) - if not blob_id: - raise ValueError("Remote entry missing checksum (no blob_id or lfs.sha256)") + blob_id = remote_entry.blob_id # type: ignore algorithm = "git-sha1" expected = str(blob_id).lower() @@ -157,11 +150,6 @@ def verify_maps( Mismatch(path=rel_path, expected="", actual=f"io-error:{exc}", algorithm="io") ) continue - except ValueError as exc: - mismatches.append( - Mismatch(path=rel_path, expected="", actual=f"meta-error:{exc}", algorithm="meta") - ) - continue if actual != expected: mismatches.append(Mismatch(path=rel_path, expected=expected, actual=actual, algorithm=algorithm)) From 3a16a655c1c311f105d4cef017ac13b9951561b5 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Wed, 29 Oct 2025 16:48:07 +0100 Subject: [PATCH 10/19] better --- src/huggingface_hub/__init__.py | 1 + src/huggingface_hub/utils/_verification.py | 24 ++++++++++------------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index 0ffdd40bd8..6abc690f3a 100644 --- a/src/huggingface_hub/__init__.py +++ b/src/huggingface_hub/__init__.py @@ -969,6 +969,7 @@ "upload_file", "upload_folder", "upload_large_folder", + "verify_repo_checksums", "webhook_endpoint", "whoami", ] diff --git a/src/huggingface_hub/utils/_verification.py b/src/huggingface_hub/utils/_verification.py index d211f08064..edba34a0c9 100644 --- a/src/huggingface_hub/utils/_verification.py +++ b/src/huggingface_hub/utils/_verification.py @@ -130,21 +130,19 @@ def verify_maps( remote_entry = remote_by_path[rel_path] local_path = 
local_by_path[rel_path] + lfs = getattr(remote_entry, "lfs", None) + lfs_sha = getattr(lfs, "sha256", None) if lfs is not None else None + if lfs_sha is None and isinstance(lfs, dict): + lfs_sha = lfs.get("sha256") + if lfs_sha: + algorithm: HashAlgo = "sha256" + expected = str(lfs_sha).lower() + else: + blob_id = remote_entry.blob_id # type: ignore + algorithm = "git-sha1" + expected = str(blob_id).lower() try: - lfs = getattr(remote_entry, "lfs", None) - lfs_sha = getattr(lfs, "sha256", None) if lfs is not None else None - if lfs_sha is None and isinstance(lfs, dict): - lfs_sha = lfs.get("sha256") - if lfs_sha: - algorithm: HashAlgo = "sha256" - expected = str(lfs_sha).lower() - else: - blob_id = remote_entry.blob_id # type: ignore - algorithm = "git-sha1" - expected = str(blob_id).lower() - actual = compute_file_hash(local_path, algorithm, git_hash_cache=git_hash_cache) - except OSError as exc: mismatches.append( Mismatch(path=rel_path, expected="", actual=f"io-error:{exc}", algorithm="io") From f9b2441502d9adfcb5f19069a40d4caa37210310 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Wed, 29 Oct 2025 17:04:20 +0100 Subject: [PATCH 11/19] remove unused helper --- src/huggingface_hub/utils/_verification.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/huggingface_hub/utils/_verification.py b/src/huggingface_hub/utils/_verification.py index edba34a0c9..fcd154f5cf 100644 --- a/src/huggingface_hub/utils/_verification.py +++ b/src/huggingface_hub/utils/_verification.py @@ -75,22 +75,6 @@ def _resolve_commit_hash_from_cache(storage_folder: Path, revision: Optional[str ) -def resolve_expected_hash(entry: Union["RepoFile", "RepoFolder"]) -> tuple[HashAlgo, str]: - """ - Return the algorithm and expected hash for a remote entry. - Prefers LFS sha256 if available; falls back to git blob_id (sha1). 
- """ - lfs = getattr(entry, "lfs", None) - lfs_sha = getattr(lfs, "sha256", None) if lfs is not None else None - if lfs_sha is None and isinstance(lfs, dict): - lfs_sha = lfs.get("sha256") - - if lfs_sha: - return ("sha256", str(lfs_sha).lower()) - blob_id = entry.blob_id # type: ignore - return ("git-sha1", str(blob_id).lower()) - - def compute_file_hash(path: Path, algorithm: HashAlgo, *, git_hash_cache: dict[Path, str]) -> str: """ Compute the checksum of a local file using the requested algorithm. From e87e904efceea96e2f51cb2286805c54a6a6713c Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Wed, 29 Oct 2025 17:28:00 +0100 Subject: [PATCH 12/19] update tests --- tests/test_verification.py | 61 +++++++++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/tests/test_verification.py b/tests/test_verification.py index 240c703a7d..bd8c8e94ac 100644 --- a/tests/test_verification.py +++ b/tests/test_verification.py @@ -3,8 +3,17 @@ from types import SimpleNamespace from unittest.mock import patch +import pytest + +import huggingface_hub.utils._verification as verification_module from huggingface_hub.hf_api import HfApi -from huggingface_hub.utils._verification import collect_local_files, resolve_local_root, verify_maps +from huggingface_hub.utils._verification import ( + HashAlgo, + collect_local_files, + compute_file_hash, + resolve_local_root, + verify_maps, +) from huggingface_hub.utils.sha import git_hash @@ -24,6 +33,43 @@ def test_collect_local_files_lists_all(tmp_path: Path) -> None: assert mapping["c.bin"].read_bytes() == b"y" +@pytest.mark.parametrize( + "algorithm,data,expected_fn", + [ + ("sha256", b"hello", lambda d: hashlib.sha256(d).hexdigest()), + ("git-sha1", b"hello", lambda d: git_hash(d)), + ], +) +def test_compute_file_hash_algorithms(tmp_path: Path, algorithm: HashAlgo, data: bytes, expected_fn) -> None: + fp = tmp_path / "x.bin" + _write(fp, data) + + cache: dict[Path, str] = {} + actual = 
compute_file_hash(fp, algorithm, git_hash_cache=cache) + assert actual == expected_fn(data) + + +def test_compute_file_hash_git_sha1_uses_cache(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + fp = tmp_path / "x.txt" + data = b"cached!" + _write(fp, data) + + calls = {"count": 0} + + def fake_git_hash(b: bytes) -> str: + calls["count"] += 1 + return git_hash(b) + + monkeypatch.setattr(verification_module, "git_hash", fake_git_hash, raising=False) + + cache: dict[Path, str] = {} + h1 = compute_file_hash(fp, "git-sha1", git_hash_cache=cache) + h2 = compute_file_hash(fp, "git-sha1", git_hash_cache=cache) + + assert h1 == h2 == git_hash(data) + assert calls["count"] == 1 + + def test_resolve_local_root_cache_single_snapshot(tmp_path: Path) -> None: cache_dir = tmp_path storage = cache_dir / "models--user--model" @@ -45,17 +91,24 @@ def test_resolve_local_root_cache_single_snapshot(tmp_path: Path) -> None: def test_verify_maps_success_local_dir(tmp_path: Path) -> None: # local loc = tmp_path / "loc" - loc.mkdir() _write(loc / "a.txt", b"aa") _write(loc / "b.txt", b"bb") local_by_path = collect_local_files(loc) + # remote entries (non-LFS for a.txt; LFS for b.txt) remote_by_path = { "a.txt": SimpleNamespace(path="a.txt", blob_id=git_hash(b"aa"), lfs=None), - "b.txt": SimpleNamespace(path="b.txt", blob_id="unused", lfs={"sha256": hashlib.sha256(b"bb").hexdigest()}), + "b.txt": SimpleNamespace( + path="b.txt", + blob_id="unused", + lfs={"sha256": hashlib.sha256(b"bb").hexdigest()}, + ), } res = verify_maps(remote_by_path=remote_by_path, local_by_path=local_by_path, revision="abc") - assert res.checked_count == 2 and not res.mismatches and not res.missing_paths and not res.extra_paths + assert res.checked_count == 2 + assert res.mismatches == [] + assert res.missing_paths == [] + assert res.extra_paths == [] def test_verify_maps_reports_mismatch(tmp_path: Path) -> None: From f24d97533d560715b7eff4bb649d27ee77bf9ca6 Mon Sep 17 00:00:00 2001 From: Celina Hanouti 
Date: Wed, 29 Oct 2025 18:01:02 +0100 Subject: [PATCH 13/19] add repo id and repo type in cli output --- docs/source/en/guides/cli.md | 3 ++- docs/source/en/guides/manage-cache.md | 3 ++- src/huggingface_hub/cli/cache.py | 29 ++++++++++++++++++---- src/huggingface_hub/hf_api.py | 13 ++++++++-- src/huggingface_hub/utils/_verification.py | 7 ++++-- tests/test_cli.py | 13 ++++++++-- 6 files changed, 55 insertions(+), 13 deletions(-) diff --git a/docs/source/en/guides/cli.md b/docs/source/en/guides/cli.md index 80a3a78cbb..7d8c49ac52 100644 --- a/docs/source/en/guides/cli.md +++ b/docs/source/en/guides/cli.md @@ -706,7 +706,8 @@ By default, the command warns about missing or extra files. Use flags to turn th On success, you will see a summary: ```text -✅ Verified 60 file(s) at e7da7f221d5bf496a48136c0cd264e630fe9fcc8; no checksum mismatches. +✅ Verified 13 file(s) for 'deepseek-ai/DeepSeek-OCR' (model) in ~/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-OCR/snapshots/9213176726f574b556790deb65791e0c5aa438b6 + All checksums match. ``` If mismatches are detected, the command prints a detailed list and exits with a non-zero status. diff --git a/docs/source/en/guides/manage-cache.md b/docs/source/en/guides/manage-cache.md index 8c6f0302f0..f6e74c9fc1 100644 --- a/docs/source/en/guides/manage-cache.md +++ b/docs/source/en/guides/manage-cache.md @@ -486,7 +486,8 @@ HFCacheInfo( ```bash >>> hf cache verify meta-llama/Llama-3.2-1B-Instruct -✅ Verified 13 file(s) at 9213176726f574b556790deb65791e0c5aa438b6; no checksum mismatches. +✅ Verified 13 file(s) for 'meta-llama/Llama-3.2-1B-Instruct' (model) in ~/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6 + All checksums match.
``` Verify a specific cached revision: diff --git a/src/huggingface_hub/cli/cache.py b/src/huggingface_hub/cli/cache.py index ce642f22d3..4009005b55 100644 --- a/src/huggingface_hub/cli/cache.py +++ b/src/huggingface_hub/cli/cache.py @@ -22,6 +22,7 @@ from collections import defaultdict from dataclasses import dataclass from enum import Enum +from pathlib import Path from typing import Annotated, Any, Callable, Dict, List, Mapping, Optional, Tuple import typer @@ -708,9 +709,11 @@ def verify( print(f" - {p}") exit_code = 1 else: - print( - f"{len(result.missing_paths)} remote file(s) are missing locally. Use --fail-on-missing-files for details." + warning = ( + f"{len(result.missing_paths)} remote file(s) are missing locally. " + "Use --fail-on-missing-files for details." ) + print(f"⚠️ {warning}") if result.extra_paths: if fail_on_extra_files: @@ -719,9 +722,11 @@ def verify( print(f" - {p}") exit_code = 1 else: - print( - f"{len(result.extra_paths)} local file(s) do not exist on remote repo. Use --fail-on-extra-files for more details." + warning = ( + f"{len(result.extra_paths)} local file(s) do not exist on the remote repo. " + "Use --fail-on-extra-files for details." 
) + print(f"⚠️ {warning}") if result.mismatches: exit_code = 1 @@ -729,4 +734,18 @@ def verify( if exit_code != 0: raise typer.Exit(code=exit_code) - print(f"✅ Verified {result.checked_count} file(s) at {result.revision}; no checksum mismatches.") + repo_label = result.repo_id or repo_id + repo_type_label = result.repo_type or (repo_type.value if hasattr(repo_type, "value") else str(repo_type)) + verified_location = result.verified_path + if verified_location is None: + if local_dir is not None: + verified_location = Path(local_dir).expanduser().resolve() + elif cache_dir is not None: + verified_location = Path(cache_dir).expanduser().resolve() + + location_suffix = "" + if verified_location is not None: + location_suffix = f" in {verified_location}" + + print(f"✅ Verified {result.checked_count} file(s) for '{repo_label}' ({repo_type_label}){location_suffix}") + print(" All checksums match.") diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index df75da3f91..71a51a7c8a 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -22,7 +22,7 @@ import warnings from collections import defaultdict from concurrent.futures import Future, ThreadPoolExecutor -from dataclasses import asdict, dataclass, field +from dataclasses import asdict, dataclass, field, replace from datetime import datetime from functools import wraps from itertools import islice @@ -3148,7 +3148,16 @@ def verify_repo_checksums( ): remote_by_path[entry.path] = entry - return verify_maps(remote_by_path=remote_by_path, local_by_path=local_by_path, revision=remote_revision) + verification = verify_maps( + remote_by_path=remote_by_path, local_by_path=local_by_path, revision=remote_revision + ) + + return replace( + verification, + verified_path=root, + repo_id=repo_id, + repo_type=repo_type, + ) @validate_hf_hub_args def list_repo_refs( diff --git a/src/huggingface_hub/utils/_verification.py b/src/huggingface_hub/utils/_verification.py index 
fcd154f5cf..d5ea2326d5 100644 --- a/src/huggingface_hub/utils/_verification.py +++ b/src/huggingface_hub/utils/_verification.py @@ -33,6 +33,9 @@ class FolderVerification: mismatches: list[Mismatch] missing_paths: list[str] extra_paths: list[str] + verified_path: Optional[Path] = None + repo_id: Optional[str] = None + repo_type: Optional[str] = None def collect_local_files(root: Path) -> dict[str, Path]: @@ -133,8 +136,8 @@ def verify_maps( ) continue - if actual != expected: - mismatches.append(Mismatch(path=rel_path, expected=expected, actual=actual, algorithm=algorithm)) + if actual != expected: + mismatches.append(Mismatch(path=rel_path, expected=expected, actual=actual, algorithm=algorithm)) return FolderVerification( revision=revision, diff --git a/tests/test_cli.py b/tests/test_cli.py index fa5691160c..77e3af11b8 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -218,7 +218,14 @@ def test_prune_dry_run(self, runner: CliRunner) -> None: def test_verify_success(self, runner: CliRunner) -> None: repo_id = "user/model" result_obj = FolderVerification( - revision="main", checked_count=1, mismatches=[], missing_paths=[], extra_paths=[] + revision="main", + checked_count=1, + mismatches=[], + missing_paths=[], + extra_paths=[], + verified_path=Path("/tmp/cache/user/model"), + repo_id=repo_id, + repo_type="model", ) with patch("huggingface_hub.cli.cache.get_hf_api") as get_api_mock: @@ -227,7 +234,9 @@ def test_verify_success(self, runner: CliRunner) -> None: result = runner.invoke(app, ["cache", "verify", repo_id]) assert result.exit_code == 0 - assert "Verified 1 file(s)" in result.stdout + stdout = result.stdout + assert "✅ Verified 1 file(s) for 'user/model' (model) in /tmp/cache/user/model" in stdout + assert " All checksums match." 
in stdout get_api_mock.assert_called_once() api.verify_repo_checksums.assert_called_once_with( repo_id=repo_id, From 84752d9216b2b18068af4bd0d3478017cb01421a Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Wed, 29 Oct 2025 18:21:40 +0100 Subject: [PATCH 14/19] better cli output --- src/huggingface_hub/cli/cache.py | 31 +++++++--------------- src/huggingface_hub/hf_api.py | 7 +---- src/huggingface_hub/utils/_verification.py | 2 -- tests/test_cli.py | 6 +++-- 4 files changed, 15 insertions(+), 31 deletions(-) diff --git a/src/huggingface_hub/cli/cache.py b/src/huggingface_hub/cli/cache.py index 4009005b55..8a4541e357 100644 --- a/src/huggingface_hub/cli/cache.py +++ b/src/huggingface_hub/cli/cache.py @@ -22,7 +22,6 @@ from collections import defaultdict from dataclasses import dataclass from enum import Enum -from pathlib import Path from typing import Annotated, Any, Callable, Dict, List, Mapping, Optional, Tuple import typer @@ -694,14 +693,15 @@ def verify( token=token, ) - # Print mismatches first if any - if result.mismatches: + exit_code = 0 + + has_mismatches = bool(result.mismatches) + if has_mismatches: print("❌ Checksum verification failed for the following file(s):") for m in result.mismatches: print(f" - {m['path']}: expected {m['expected']} ({m['algorithm']}), got {m['actual']}") + exit_code = 1 - # Handle missing/extra - exit_code = 0 if result.missing_paths: if fail_on_missing_files: print("Missing files (present remotely, absent locally):") @@ -728,24 +728,13 @@ def verify( ) print(f"⚠️ {warning}") - if result.mismatches: - exit_code = 1 + verified_location = result.verified_path if exit_code != 0: + location_suffix_error = f" in {verified_location}" if verified_location is not None else "" + print(f"❌ Verification failed for '{repo_id}' ({repo_type.value}){location_suffix_error}.") + print(f" Revision: {result.revision}") raise typer.Exit(code=exit_code) - repo_label = result.repo_id or repo_id - repo_type_label = result.repo_type or 
(repo_type.value if hasattr(repo_type, "value") else str(repo_type)) - verified_location = result.verified_path - if verified_location is None: - if local_dir is not None: - verified_location = Path(local_dir).expanduser().resolve() - elif cache_dir is not None: - verified_location = Path(cache_dir).expanduser().resolve() - - location_suffix = "" - if verified_location is not None: - location_suffix = f" in {verified_location}" - - print(f"✅ Verified {result.checked_count} file(s) for '{repo_label}' ({repo_type_label}){location_suffix}") + print(f"✅ Verified {result.checked_count} file(s) for '{repo_id}' ({repo_type.value}) in {verified_location}") print(" All checksums match.") diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 71a51a7c8a..168bb8f6fd 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -3152,12 +3152,7 @@ def verify_repo_checksums( remote_by_path=remote_by_path, local_by_path=local_by_path, revision=remote_revision ) - return replace( - verification, - verified_path=root, - repo_id=repo_id, - repo_type=repo_type, - ) + return replace(verification, verified_path=root) @validate_hf_hub_args def list_repo_refs( diff --git a/src/huggingface_hub/utils/_verification.py b/src/huggingface_hub/utils/_verification.py index d5ea2326d5..6c64158b48 100644 --- a/src/huggingface_hub/utils/_verification.py +++ b/src/huggingface_hub/utils/_verification.py @@ -34,8 +34,6 @@ class FolderVerification: missing_paths: list[str] extra_paths: list[str] verified_path: Optional[Path] = None - repo_id: Optional[str] = None - repo_type: Optional[str] = None def collect_local_files(root: Path) -> dict[str, Path]: diff --git a/tests/test_cli.py b/tests/test_cli.py index 77e3af11b8..02f3ccf100 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -224,8 +224,6 @@ def test_verify_success(self, runner: CliRunner) -> None: missing_paths=[], extra_paths=[], verified_path=Path("/tmp/cache/user/model"), - repo_id=repo_id, 
- repo_type="model", ) with patch("huggingface_hub.cli.cache.get_hf_api") as get_api_mock: @@ -266,6 +264,8 @@ def test_verify_reports_mismatch(self, runner: CliRunner) -> None: assert "Checksum verification failed" in result.stdout assert "pytorch_model.bin" in result.stdout assert "expected" in result.stdout + assert "Verification failed for 'user/model' (model)" in result.stdout + assert "Revision: main" in result.stdout def test_verify_reports_missing_local_file(self, runner: CliRunner) -> None: commit_hash = "4" * 40 @@ -335,6 +335,8 @@ def test_verify_reports_missing_local_file(self, runner: CliRunner) -> None: assert result.exit_code == 1 assert "missing locally" in result.stdout + assert "Verification failed for" in result.stdout + assert "Revision:" in result.stdout class TestUploadCommand: From 3b6f4624c0af367aa048b861fa62fc635fe713c1 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Wed, 29 Oct 2025 18:24:43 +0100 Subject: [PATCH 15/19] fix --- src/huggingface_hub/cli/cache.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/huggingface_hub/cli/cache.py b/src/huggingface_hub/cli/cache.py index 8a4541e357..6fdf910b1f 100644 --- a/src/huggingface_hub/cli/cache.py +++ b/src/huggingface_hub/cli/cache.py @@ -731,8 +731,7 @@ def verify( verified_location = result.verified_path if exit_code != 0: - location_suffix_error = f" in {verified_location}" if verified_location is not None else "" - print(f"❌ Verification failed for '{repo_id}' ({repo_type.value}){location_suffix_error}.") + print(f"❌ Verification failed for '{repo_id}' ({repo_type.value}) in {verified_location}.") print(f" Revision: {result.revision}") raise typer.Exit(code=exit_code) From 4ed6e69d4f9d78dccee0496ac79a7064303b0a47 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Fri, 31 Oct 2025 11:16:15 +0100 Subject: [PATCH 16/19] review suggestions --- src/huggingface_hub/hf_api.py | 11 ++++++----- src/huggingface_hub/utils/_verification.py | 23 +++++++++++----------- 2 
files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 168bb8f6fd..9c1e77c687 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -22,7 +22,7 @@ import warnings from collections import defaultdict from concurrent.futures import Future, ThreadPoolExecutor -from dataclasses import asdict, dataclass, field, replace +from dataclasses import asdict, dataclass, field from datetime import datetime from functools import wraps from itertools import islice @@ -3148,12 +3148,13 @@ def verify_repo_checksums( ): remote_by_path[entry.path] = entry - verification = verify_maps( - remote_by_path=remote_by_path, local_by_path=local_by_path, revision=remote_revision + return verify_maps( + remote_by_path=remote_by_path, + local_by_path=local_by_path, + revision=remote_revision, + verified_path=root, ) - return replace(verification, verified_path=root) - @validate_hf_hub_args def list_repo_refs( self, diff --git a/src/huggingface_hub/utils/_verification.py b/src/huggingface_hub/utils/_verification.py index 6c64158b48..ab09510590 100644 --- a/src/huggingface_hub/utils/_verification.py +++ b/src/huggingface_hub/utils/_verification.py @@ -33,7 +33,7 @@ class FolderVerification: mismatches: list[Mismatch] missing_paths: list[str] extra_paths: list[str] - verified_path: Optional[Path] = None + verified_path: Path def collect_local_files(root: Path) -> dict[str, Path]: @@ -76,7 +76,7 @@ def _resolve_commit_hash_from_cache(storage_folder: Path, revision: Optional[str ) -def compute_file_hash(path: Path, algorithm: HashAlgo, *, git_hash_cache: dict[Path, str]) -> str: +def compute_file_hash(path: Path, algorithm: HashAlgo) -> str: """ Compute the checksum of a local file using the requested algorithm. 
""" @@ -86,19 +86,18 @@ def compute_file_hash(path: Path, algorithm: HashAlgo, *, git_hash_cache: dict[P return sha_fileobj(stream).hex() if algorithm == "git-sha1": - try: - return git_hash_cache[path] - except KeyError: - with path.open("rb") as stream: - digest = git_hash(stream.read()) - git_hash_cache[path] = digest - return digest + with path.open("rb") as stream: + return git_hash(stream.read()) raise ValueError(f"Unsupported hash algorithm: {algorithm}") def verify_maps( - *, remote_by_path: dict[str, Union["RepoFile", "RepoFolder"]], local_by_path: dict[str, Path], revision: str + *, + remote_by_path: dict[str, Union["RepoFile", "RepoFolder"]], + local_by_path: dict[str, Path], + revision: str, + verified_path: Path, ) -> FolderVerification: """Compare remote entries and local files and return a verification result.""" remote_paths = set(remote_by_path) @@ -109,7 +108,6 @@ def verify_maps( both = sorted(remote_paths & local_paths) mismatches: list[Mismatch] = [] - git_hash_cache: dict[Path, str] = {} for rel_path in both: remote_entry = remote_by_path[rel_path] @@ -127,7 +125,7 @@ def verify_maps( algorithm = "git-sha1" expected = str(blob_id).lower() try: - actual = compute_file_hash(local_path, algorithm, git_hash_cache=git_hash_cache) + actual = compute_file_hash(local_path, algorithm) except OSError as exc: mismatches.append( Mismatch(path=rel_path, expected="", actual=f"io-error:{exc}", algorithm="io") @@ -143,6 +141,7 @@ def verify_maps( mismatches=mismatches, missing_paths=missing, extra_paths=extra, + verified_path=verified_path, ) From 0b761c2cbec90dd54a041d4c83c124a657797730 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Fri, 31 Oct 2025 11:35:30 +0100 Subject: [PATCH 17/19] cleaner --- src/huggingface_hub/utils/_verification.py | 11 +++----- tests/test_cli.py | 1 + tests/test_verification.py | 29 +++++++++++++++------- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/huggingface_hub/utils/_verification.py 
b/src/huggingface_hub/utils/_verification.py index ab09510590..b68b3a74e8 100644 --- a/src/huggingface_hub/utils/_verification.py +++ b/src/huggingface_hub/utils/_verification.py @@ -81,15 +81,12 @@ def compute_file_hash(path: Path, algorithm: HashAlgo) -> str: Compute the checksum of a local file using the requested algorithm. """ - if algorithm == "sha256": - with path.open("rb") as stream: + with path.open("rb") as stream: + if algorithm == "sha256": return sha_fileobj(stream).hex() - - if algorithm == "git-sha1": - with path.open("rb") as stream: + if algorithm == "git-sha1": return git_hash(stream.read()) - - raise ValueError(f"Unsupported hash algorithm: {algorithm}") + raise ValueError(f"Unsupported hash algorithm: {algorithm}") def verify_maps( diff --git a/tests/test_cli.py b/tests/test_cli.py index 02f3ccf100..8135d250b5 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -253,6 +253,7 @@ def test_verify_reports_mismatch(self, runner: CliRunner) -> None: mismatches=[{"path": "pytorch_model.bin", "expected": "dead", "actual": "beef", "algorithm": "sha256"}], missing_paths=[], extra_paths=[], + verified_path=Path("/tmp/cache/user/model"), ) with patch("huggingface_hub.cli.cache.get_hf_api") as get_api_mock: diff --git a/tests/test_verification.py b/tests/test_verification.py index bd8c8e94ac..6f458e67c4 100644 --- a/tests/test_verification.py +++ b/tests/test_verification.py @@ -44,12 +44,11 @@ def test_compute_file_hash_algorithms(tmp_path: Path, algorithm: HashAlgo, data: fp = tmp_path / "x.bin" _write(fp, data) - cache: dict[Path, str] = {} - actual = compute_file_hash(fp, algorithm, git_hash_cache=cache) + actual = compute_file_hash(fp, algorithm) assert actual == expected_fn(data) -def test_compute_file_hash_git_sha1_uses_cache(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: +def test_compute_file_hash_git_sha1_computes_hash(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: fp = tmp_path / "x.txt" data = b"cached!" 
_write(fp, data) @@ -62,12 +61,12 @@ def fake_git_hash(b: bytes) -> str: monkeypatch.setattr(verification_module, "git_hash", fake_git_hash, raising=False) - cache: dict[Path, str] = {} - h1 = compute_file_hash(fp, "git-sha1", git_hash_cache=cache) - h2 = compute_file_hash(fp, "git-sha1", git_hash_cache=cache) + h1 = compute_file_hash(fp, "git-sha1") + h2 = compute_file_hash(fp, "git-sha1") assert h1 == h2 == git_hash(data) - assert calls["count"] == 1 + # Each call computes the hash independently (no cache) + assert calls["count"] == 2 def test_resolve_local_root_cache_single_snapshot(tmp_path: Path) -> None: @@ -104,11 +103,17 @@ def test_verify_maps_success_local_dir(tmp_path: Path) -> None: lfs={"sha256": hashlib.sha256(b"bb").hexdigest()}, ), } - res = verify_maps(remote_by_path=remote_by_path, local_by_path=local_by_path, revision="abc") + res = verify_maps( + remote_by_path=remote_by_path, + local_by_path=local_by_path, + revision="abc", + verified_path=loc, + ) assert res.checked_count == 2 assert res.mismatches == [] assert res.missing_paths == [] assert res.extra_paths == [] + assert res.verified_path == loc def test_verify_maps_reports_mismatch(tmp_path: Path) -> None: @@ -117,10 +122,16 @@ def test_verify_maps_reports_mismatch(tmp_path: Path) -> None: _write(loc / "a.txt", b"wrong") local_by_path = collect_local_files(loc) remote_by_path = {"a.txt": SimpleNamespace(path="a.txt", blob_id=git_hash(b"right"), lfs=None)} - res = verify_maps(remote_by_path=remote_by_path, local_by_path=local_by_path, revision="r") + res = verify_maps( + remote_by_path=remote_by_path, + local_by_path=local_by_path, + revision="r", + verified_path=loc, + ) assert len(res.mismatches) == 1 m = res.mismatches[0] assert m["path"] == "a.txt" and m["algorithm"] == "git-sha1" + assert res.verified_path == loc def test_api_verify_repo_checksums_cache_mode(tmp_path: Path) -> None: From a259ad1db8e1d259f182f8d45a3c605256a50b07 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Fri, 31 
Oct 2025 11:41:11 +0100 Subject: [PATCH 18/19] don't handle os errors --- src/huggingface_hub/utils/_verification.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/huggingface_hub/utils/_verification.py b/src/huggingface_hub/utils/_verification.py index b68b3a74e8..f32a4f4994 100644 --- a/src/huggingface_hub/utils/_verification.py +++ b/src/huggingface_hub/utils/_verification.py @@ -121,13 +121,8 @@ def verify_maps( blob_id = remote_entry.blob_id # type: ignore algorithm = "git-sha1" expected = str(blob_id).lower() - try: - actual = compute_file_hash(local_path, algorithm) - except OSError as exc: - mismatches.append( - Mismatch(path=rel_path, expected="", actual=f"io-error:{exc}", algorithm="io") - ) - continue + + actual = compute_file_hash(local_path, algorithm) if actual != expected: mismatches.append(Mismatch(path=rel_path, expected=expected, actual=actual, algorithm=algorithm)) From c28bebf02050f902f07e5cecaf0ceacaff99fc09 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Fri, 31 Oct 2025 15:28:20 +0100 Subject: [PATCH 19/19] fix test on windows --- tests/test_cli.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 8135d250b5..efbeee7bdc 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -217,13 +217,14 @@ def test_prune_dry_run(self, runner: CliRunner) -> None: def test_verify_success(self, runner: CliRunner) -> None: repo_id = "user/model" + verified_path = Path("/tmp/cache/user/model") result_obj = FolderVerification( revision="main", checked_count=1, mismatches=[], missing_paths=[], extra_paths=[], - verified_path=Path("/tmp/cache/user/model"), + verified_path=verified_path, ) with patch("huggingface_hub.cli.cache.get_hf_api") as get_api_mock: @@ -233,7 +234,9 @@ def test_verify_success(self, runner: CliRunner) -> None: assert result.exit_code == 0 stdout = result.stdout - assert "✅ Verified 1 file(s) for 'user/model' (model) in 
/tmp/cache/user/model" in stdout + normalized_stdout = stdout.replace("\\", "/") + expected_path_str = verified_path.as_posix() + assert f"✅ Verified 1 file(s) for 'user/model' (model) in {expected_path_str}" in normalized_stdout assert " All checksums match." in stdout get_api_mock.assert_called_once() api.verify_repo_checksums.assert_called_once_with(