@@ -10,17 +10,22 @@ import { existsSync } from "node:fs";
10
10
11
11
/**
12
12
* This script debugs xet uploads by capturing all network data locally
13
- * It takes a local file , repo, and token, then uploads while saving:
13
+ * It takes one or more local files, a repo, and a token, then uploads while saving:
14
14
* - Dedup shards as dedup_[chunk_hash]_shard.bin
15
15
* - Uploaded xorbs as uploaded_xorb_1.bin, uploaded_xorb_2.bin, etc.
16
16
* - Uploaded shards as uploaded_shard_1.bin, uploaded_shard_2.bin, etc.
17
17
*
18
- * Normal mode: Captures all upload data to upload_[filename]/ directory
18
+ * Normal mode: Captures all upload data to upload_[filename]/ directory (single file) or multiple-files/ directory (multiple files)
19
19
* Replay mode: Validates upload data matches previously captured local files
20
20
*
21
21
* Usage:
22
+ * Single file:
22
23
* pnpm --filter hub debug-xet -f <local_file> -t <write_token> -r <xet_repo>
23
24
* pnpm --filter hub debug-xet -f <local_file> -t <write_token> -r <xet_repo> --replay
25
+ *
26
+ * Multiple files (comma-separated):
27
+ * pnpm --filter hub debug-xet -f <file1,file2,file3> -t <write_token> -r <xet_repo>
28
+ * pnpm --filter hub debug-xet -f <file1,file2,file3> -t <write_token> -r <xet_repo> --replay
24
29
*/
25
30
26
31
interface DebugFetchStats {
@@ -182,32 +187,34 @@ function createDebugFetch(args: { debugDir: string; replay?: boolean }): {
182
187
} ;
183
188
}
184
189
185
/**
 * Builds the stream of upload entries for one or more local files.
 *
 * Files are handled one at a time, in the order given: each path is opened
 * with `FileBlob.create`, its `sha256` iterator is driven to completion, and
 * the resulting entry is yielded. Because this is a generator, work on the
 * next file only starts once the consumer asks for the next entry.
 *
 * NOTE(review): `path` is the base name only, so two inputs that share a base
 * name (e.g. `a/model.bin` and `b/model.bin`) produce entries with the same
 * `path` — confirm this is acceptable for the consumer.
 *
 * @param filepaths Local paths of the files to upload.
 * @returns An async generator yielding, per file, the blob (`content`), the
 *   file's base name (`path`), and the final value of the hash iterator
 *   (`sha256`).
 */
async function* createMultiFileSource(filepaths: string[]): AsyncGenerator<{
	content: Blob;
	path: string;
	sha256: string;
}> {
	for (const filepath of filepaths) {
		const filename = basename(filepath);
		console.log(`Processing ${filename}...`);

		const blob: Blob = await FileBlob.create(filepath);

		// Calculate sha256: intermediate values from the iterator are ignored;
		// only the final (done) result carries the hash string.
		console.log(`Calculating SHA256 for ${filename}...`);
		const sha256Iterator = sha256(blob, { useWebWorker: false });
		let res: IteratorResult<number, string>;
		do {
			res = await sha256Iterator.next();
		} while (!res.done);
		const sha256Hash = res.value;

		console.log(`SHA256 for ${filename}: ${sha256Hash}`);

		yield {
			content: blob,
			path: filename,
			sha256: sha256Hash,
		};
	}
}
212
219
213
220
async function main ( ) {
@@ -233,20 +240,27 @@ async function main() {
233
240
} ) ;
234
241
235
242
if ( ! args . token || ! args . repo || ! args . file ) {
236
- console . error ( "Usage: pnpm --filter hub debug-xet -f <local_file > -t <write_token> -r <xet_repo>" ) ;
243
+ console . error ( "Usage: pnpm --filter hub debug-xet -f <file1,file2,file3 > -t <write_token> -r <xet_repo>" ) ;
237
244
console . error ( "Example: pnpm --filter hub debug-xet -f ./model.bin -t hf_... -r myuser/myrepo" ) ;
245
+ console . error ( "Example: pnpm --filter hub debug-xet -f ./model1.bin,./model2.bin -t hf_... -r myuser/myrepo" ) ;
238
246
console . error ( "Options:" ) ;
239
247
console . error ( " --replay Use local dedup info instead of remote" ) ;
240
248
process . exit ( 1 ) ;
241
249
}
242
250
243
- if ( ! existsSync ( args . file ) ) {
244
- console . error ( `❌ File ${ args . file } does not exist` ) ;
245
- process . exit ( 1 ) ;
251
+ // Parse comma-separated file paths
252
+ const filePaths = args . file . split ( "," ) . map ( ( f ) => f . trim ( ) ) ;
253
+
254
+ // Validate all files exist
255
+ for ( const filePath of filePaths ) {
256
+ if ( ! existsSync ( filePath ) ) {
257
+ console . error ( `❌ File ${ filePath } does not exist` ) ;
258
+ process . exit ( 1 ) ;
259
+ }
246
260
}
247
261
248
- const filename = basename ( args . file ) ;
249
- const debugDir = `upload_${ filename } ` ;
262
+ // Determine debug directory name
263
+ const debugDir = filePaths . length > 1 ? "multiple-files" : `upload_${ basename ( filePaths [ 0 ] ) } ` ;
250
264
251
265
// Handle debug directory based on mode
252
266
if ( args . replay ) {
@@ -288,20 +302,30 @@ async function main() {
288
302
rev : "main" ,
289
303
} ;
290
304
291
- console . log ( `\n=== Starting debug upload for ${ filename } ===` ) ;
305
+ console . log (
306
+ `\n=== Starting debug upload for ${ filePaths . length > 1 ? `${ filePaths . length } files` : basename ( filePaths [ 0 ] ) } ===`
307
+ ) ;
292
308
if ( args . replay ) {
293
309
console . log ( "🔄 Replay mode: Using local dedup info when available" ) ;
294
310
}
295
311
296
- // Get file stats
297
- const fileStats = await stat ( args . file ) ;
298
- console . log ( `📄 File size: ${ ( fileStats . size / 1024 / 1024 ) . toFixed ( 2 ) } MB` ) ;
312
+ // Get total file stats
313
+ let totalSize = 0 ;
314
+ for ( const filePath of filePaths ) {
315
+ const fileStats = await stat ( filePath ) ;
316
+ totalSize += fileStats . size ;
317
+ console . log ( `📄 ${ basename ( filePath ) } : ${ ( fileStats . size / 1_000_000 ) . toFixed ( 2 ) } MB` ) ;
318
+ }
319
+ console . log ( `📊 Total size: ${ ( totalSize / 1_000_000 ) . toFixed ( 2 ) } MB` ) ;
299
320
300
- // Process file through uploadShards
301
- const fileSource = createFileSource ( args . file ) ;
321
+ // Process files through uploadShards
322
+ const fileSource = createMultiFileSource ( filePaths ) ;
302
323
303
- let dedupRatio = 0 ;
304
- let fileSha256 = "" ;
324
+ const processedFiles : Array < {
325
+ path : string ;
326
+ sha256 : string ;
327
+ dedupRatio : number ;
328
+ } > = [ ] ;
305
329
306
330
for await ( const event of uploadShards ( fileSource , uploadParams ) ) {
307
331
switch ( event . event ) {
@@ -310,8 +334,11 @@ async function main() {
310
334
console . log ( ` SHA256: ${ event . sha256 } ` ) ;
311
335
console . log ( ` Dedup ratio: ${ ( event . dedupRatio * 100 ) . toFixed ( 2 ) } %` ) ;
312
336
313
- dedupRatio = event . dedupRatio ;
314
- fileSha256 = event . sha256 ;
337
+ processedFiles . push ( {
338
+ path : event . path ,
339
+ sha256 : event . sha256 ,
340
+ dedupRatio : event . dedupRatio ,
341
+ } ) ;
315
342
break ;
316
343
}
317
344
@@ -327,9 +354,21 @@ async function main() {
327
354
328
355
console . log ( "\n=== DEBUG UPLOAD RESULTS ===" ) ;
329
356
console . log ( `📁 Debug directory: ${ debugDir } ` ) ;
330
- console . log ( `📄 Original file: ${ filename } (${ ( fileStats . size / 1024 / 1024 ) . toFixed ( 2 ) } MB)` ) ;
331
- console . log ( `🔒 SHA256: ${ fileSha256 } ` ) ;
332
- console . log ( `📊 Deduplication: ${ ( dedupRatio * 100 ) . toFixed ( 2 ) } %` ) ;
357
+ console . log ( `📄 Processed files: ${ processedFiles . length } ` ) ;
358
+ console . log ( `📊 Total size: ${ ( totalSize / 1024 / 1024 ) . toFixed ( 2 ) } MB` ) ;
359
+
360
+ // Show details for each file
361
+ for ( const file of processedFiles ) {
362
+ console . log ( `\n🔒 ${ file . path } :` ) ;
363
+ console . log ( ` SHA256: ${ file . sha256 } ` ) ;
364
+ console . log ( ` Deduplication: ${ ( file . dedupRatio * 100 ) . toFixed ( 2 ) } %` ) ;
365
+ }
366
+
367
+ // Calculate average dedup ratio
368
+ const avgDedupRatio =
369
+ processedFiles . length > 0 ? processedFiles . reduce ( ( sum , f ) => sum + f . dedupRatio , 0 ) / processedFiles . length : 0 ;
370
+
371
+ console . log ( `\n📊 Average deduplication: ${ ( avgDedupRatio * 100 ) . toFixed ( 2 ) } %` ) ;
333
372
console . log ( `📤 Network calls:` ) ;
334
373
console . log ( ` - ${ stats . xorbCount } xorb uploads` ) ;
335
374
console . log ( ` - ${ stats . shardCount } shard uploads` ) ;
0 commit comments