@@ -277,13 +277,56 @@ def prepare_tensor(tensor, dst_shape, *, force_transpose=False):
277277 if len (tensor .shape ) != 1 :
278278 print ("attention same shape not transpose !!!!!!!!!!!!!!!!!!!!!!" )
279279 return tensor
280- if len (tensor .shape ) == 2 and paddle .transpose (tensor , perm = [1 , 0 ]).contiguous ().shape == dst_shape :
280+
281+ if len (tensor .shape ) == 2 :
282+ num_experts , hidden_size = tensor .shape
283+ assert hidden_size == dst_shape [0 ], f"Shape not match: { tensor .shape } { dst_shape } "
284+ if num_experts != dst_shape [1 ]:
285+ print (f"Slice weight: { tensor .shape } -> { dst_shape } " )
286+ tensor = tensor [:dst_shape [1 ]]
281287 return paddle .transpose (tensor , perm = [1 , 0 ]).contiguous ()
282288
283- print ("shape not match here" )
289+ if len (tensor .shape ) == 1 :
290+ print (f"Slice weight: { tensor .shape } -> { dst_shape } " )
291+ tensor = tensor [:dst_shape [0 ]]
292+ return tensor
293+
294+ print ("Fatal: shape not match here:" , tensor .shape , dst_shape )
284295 sys .exit ()
285296
286297
def hf_cache(path, cache_dir='/dev/shm'):
    """Copy *path* into a node-local cache directory exactly once per machine.

    Many ranks on one host may call this concurrently for the same checkpoint
    shard; a ``.lock`` file created with O_EXCL elects a single copier, and the
    other ranks wait for the lock to disappear.  The copy is written to the
    lock file and atomically renamed into place, so ``cache_path`` is only ever
    observed complete.

    Args:
        path: source file to cache (e.g. a safetensors shard).
        cache_dir: destination directory; defaults to ``/dev/shm`` (tmpfs).

    Returns:
        The path of the cached copy inside ``cache_dir``.

    Raises:
        OSError: if the copy keeps failing after repeated retries.
    """
    import os
    import shutil
    import time

    print('looking up:', path)
    cache_path = os.path.join(cache_dir, 'lshrun_' + os.path.basename(path))
    lock_path = cache_path + '.lock'

    # Case 1: cache already populated (by us or by a peer).
    if os.path.exists(cache_path):
        print('hit cache:', cache_path)
        return cache_path

    try:
        # O_CREAT|O_EXCL is atomic: exactly one process wins the lock.
        # (The original used open(lock_path, 'x') and leaked the handle.)
        fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
        os.close(fd)
    except FileExistsError:
        # Case 2: a peer holds the lock and is copying; poll until it is done.
        print('waiting peer load:', lock_path)
        while os.path.exists(lock_path):
            time.sleep(0.1)
        print('peer done:', lock_path)
        if not os.path.exists(cache_path):
            # Peer crashed after removing its lock without publishing the
            # cache file -> retry from scratch (we may win the lock this time).
            return hf_cache(path, cache_dir)
    else:
        # Case 3: we won the lock; copy, then atomically publish.
        print('copying:', lock_path)
        try:
            for _ in range(100):
                try:
                    shutil.copyfile(path, lock_path)
                    break
                except OSError:
                    # e.g. "too many open files" under heavy parallel load;
                    # back off and retry, but do not loop forever.
                    print('retrying:', path, '->', lock_path)
                    time.sleep(10)
            else:
                raise OSError(f'failed to copy {path} after repeated attempts')
            # Atomic rename both publishes the cache and releases the lock.
            os.replace(lock_path, cache_path)
        except BaseException:
            # Never leave a stale lock behind: it would deadlock every waiter.
            if os.path.exists(lock_path):
                os.remove(lock_path)
            raise
        print('done copy:', lock_path)

    return cache_path
328+
329+
287330def load_huggingface_ckpt (model , huggingface_ckpt_path ):
288331 ckpt_pre = huggingface_ckpt_path
289332
@@ -328,8 +371,9 @@ def load_huggingface_ckpt(model, huggingface_ckpt_path):
328371 check_list = []
329372 print ("Start load huggingface ckpt" )
330373 for i , filename in enumerate (required_files ):
374+ print (f'loading { i + 1 } /{ len (required_files )} : { filename } ' )
331375 try :
332- with safe_open (ckpt_pre + filename , framework = "paddle" , device = "cpu" ) as f :
376+ with safe_open (hf_cache ( ckpt_pre + filename ) , framework = "paddle" , device = "cpu" ) as f :
333377 # 加载该文件包含的所有参数
334378 pd_params = file_to_pd_param_name [filename ]
335379 for pd_param in pd_params :
@@ -359,12 +403,12 @@ def load_huggingface_ckpt(model, huggingface_ckpt_path):
359403 if weight_map [hf_name [0 ]] == filename :
360404 tensor0 = f .get_tensor (hf_name [0 ])
361405 with safe_open (
362- ckpt_pre + weight_map [hf_name [1 ]], framework = "paddle" , device = "cpu"
406+ hf_cache ( ckpt_pre + weight_map [hf_name [1 ]]) , framework = "paddle" , device = "cpu"
363407 ) as f_other :
364408 tensor1 = f_other .get_tensor (hf_name [1 ])
365409 else :
366410 with safe_open (
367- ckpt_pre + weight_map [hf_name [0 ]], framework = "paddle" , device = "cpu"
411+ hf_cache ( ckpt_pre + weight_map [hf_name [0 ]]) , framework = "paddle" , device = "cpu"
368412 ) as f_other :
369413 tensor0 = f_other .get_tensor (hf_name [0 ])
370414 tensor1 = f .get_tensor (hf_name [1 ])
@@ -376,3 +420,4 @@ def load_huggingface_ckpt(model, huggingface_ckpt_path):
376420 except Exception as e :
377421 print (f"Error loading { filename } : { str (e )} " )
378422 raise
423+ print ("End load huggingface ckpt" )
0 commit comments