Skip to content

Commit 78f86ea

Browse files
committed
implement retry mechanism for HEAD call
1 parent 781a5ac commit 78f86ea

File tree

3 files changed

+50
-5
lines changed

3 files changed

+50
-5
lines changed

docs/source/en/package_reference/environment_variables.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,11 @@ For more details, see [logging reference](../package_reference/utilities#hugging
7373

7474
### HF_HUB_ETAG_TIMEOUT
7575

76-
Integer value to define the number of seconds to wait for server response when fetching the latest metadata from a repo before downloading a file. If the request times out, `huggingface_hub` will default to the locally cached files. Setting a lower value speeds up the workflow for machines with a slow connection that have already cached files. A higher value guarantees the metadata call to succeed in more cases. Default to 10s.
76+
Integer value to define the initial number of seconds to wait for a server response when fetching the latest metadata from a repo before downloading a file. If the request times out, `huggingface_hub` falls back to the locally cached files. If no cached file is found, a retry is attempted with a longer timeout (see `HF_HUB_ETAG_TIMEOUT_RETRY`). Setting a lower value speeds up the workflow for machines with a slow connection that have already cached files. Defaults to 10s.
77+
78+
### HF_HUB_ETAG_TIMEOUT_RETRY
79+
80+
Integer value to define the number of seconds to wait when retrying the metadata fetch after an initial timeout. When the initial metadata request times out and no locally cached file is found, `huggingface_hub` retries with this longer timeout before failing. This helps handle transient network slowdowns while keeping the initial timeout short for machines that already have cached files. Defaults to 60s.
7781

7882
### HF_HUB_DOWNLOAD_TIMEOUT
7983

src/huggingface_hub/constants.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def _as_int(value: Optional[str]) -> Optional[int]:
3232
CONFIG_NAME = "config.json"
3333
REPOCARD_NAME = "README.md"
3434
DEFAULT_ETAG_TIMEOUT = 10
35+
DEFAULT_ETAG_TIMEOUT_RETRY = 60
3536
DEFAULT_DOWNLOAD_TIMEOUT = 10
3637
DEFAULT_REQUEST_TIMEOUT = 10
3738
DOWNLOAD_CHUNK_SIZE = 10 * 1024 * 1024
@@ -230,6 +231,9 @@ def _as_int(value: Optional[str]) -> Optional[int]:
230231
# Used to override the etag timeout on a system level
231232
HF_HUB_ETAG_TIMEOUT: int = _as_int(os.environ.get("HF_HUB_ETAG_TIMEOUT")) or DEFAULT_ETAG_TIMEOUT
232233

234+
# Used to override the etag retry timeout on a system level (applied when the initial metadata request times out and no local file is found)
235+
HF_HUB_ETAG_TIMEOUT_RETRY: int = _as_int(os.environ.get("HF_HUB_ETAG_TIMEOUT_RETRY")) or DEFAULT_ETAG_TIMEOUT_RETRY
236+
233237
# Used to override the get request timeout on a system level
234238
HF_HUB_DOWNLOAD_TIMEOUT: int = _as_int(os.environ.get("HF_HUB_DOWNLOAD_TIMEOUT")) or DEFAULT_DOWNLOAD_TIMEOUT
235239

src/huggingface_hub/file_download.py

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1131,8 +1131,28 @@ def _hf_hub_download_to_cache_dir(
11311131
if not force_download:
11321132
return pointer_path
11331133

1134-
# Otherwise, raise appropriate error
1135-
_raise_on_head_call_error(head_call_error, force_download, local_files_only)
1134+
# No local file found, retry with longer timeout if it was a timeout error
1135+
if isinstance(head_call_error, httpx.TimeoutException):
1136+
logger.info("Metadata fetch timed out and no local file found. Retrying with longer timeout..")
1137+
(url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = (
1138+
_get_metadata_or_catch_error(
1139+
repo_id=repo_id,
1140+
filename=filename,
1141+
repo_type=repo_type,
1142+
revision=revision,
1143+
endpoint=endpoint,
1144+
etag_timeout=constants.HF_HUB_ETAG_TIMEOUT_RETRY,
1145+
headers=headers,
1146+
token=token,
1147+
local_files_only=local_files_only,
1148+
storage_folder=storage_folder,
1149+
relative_filename=relative_filename,
1150+
)
1151+
)
1152+
1153+
# If still error, raise
1154+
if head_call_error is not None:
1155+
_raise_on_head_call_error(head_call_error, force_download, local_files_only)
11361156

11371157
# From now on, etag, commit_hash, url and size are not None.
11381158
assert etag is not None, "etag must have been retrieved from server"
@@ -1300,9 +1320,26 @@ def _hf_hub_download_to_local_dir(
13001320
)
13011321
if not force_download:
13021322
return local_path
1323+
elif not force_download and isinstance(head_call_error, httpx.TimeoutException):
1324+
# No local file found, retry with longer timeout if it was a timeout error
1325+
logger.info("Metadata fetch timed out and no local file found. Retrying with longer timeout...")
1326+
(url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = (
1327+
_get_metadata_or_catch_error(
1328+
repo_id=repo_id,
1329+
filename=filename,
1330+
repo_type=repo_type,
1331+
revision=revision,
1332+
endpoint=endpoint,
1333+
etag_timeout=constants.HF_HUB_ETAG_TIMEOUT_RETRY,
1334+
headers=headers,
1335+
token=token,
1336+
local_files_only=local_files_only,
1337+
)
1338+
)
13031339

1304-
# Otherwise => raise
1305-
_raise_on_head_call_error(head_call_error, force_download, local_files_only)
1340+
# If still error, raise
1341+
if head_call_error is not None:
1342+
_raise_on_head_call_error(head_call_error, force_download, local_files_only)
13061343

13071344
# From now on, etag, commit_hash, url and size are not None.
13081345
assert etag is not None, "etag must have been retrieved from server"

0 commit comments

Comments
 (0)