11import  logging 
22
33import  psutil 
4- 
4+ import   shlex 
55from  run .runner  import  Runner 
66from  sparrow_schema .schema  import  sparrow 
77
@@ -17,6 +17,7 @@ class Extractor:
1717    sql_extractor  =  "" 
1818    swift_extractor  =  "" 
1919    xml_extractor  =  "" 
20+     arkts_extractor  =  "" 
2021
2122    def  __init__ (self ):
2223        Extractor .cfamily_extractor  =  sparrow .home  /  "language"  /  "cfamily"  /  "extractor"  /  "usr"  /  "bin"  /  "coref-cfamily-src-extractor" 
@@ -28,6 +29,7 @@ def __init__(self):
2829        Extractor .sql_extractor  =  sparrow .home  /  "language"  /  "sql"  /  "extractor"  /  "coref-sql-src-extractor_deploy.jar" 
2930        Extractor .swift_extractor  =  sparrow .home  /  "language"  /  "swift"  /  "extractor"  /  "usr"  /  "bin"  /  "coref-swift-src-extractor" 
3031        Extractor .xml_extractor  =  sparrow .home  /  "language"  /  "xml"  /  "extractor"  /  "coref-xml-extractor_deploy.jar" 
32+         Extractor .arkts_extractor  =  sparrow .home  /  "language"  /  "arkts"  /  "extractor"  /  "coref-arkts-src-extractor" 
3133
3234
3335def  cfamily_extractor_cmd (source_root , database , options ):
@@ -58,15 +60,19 @@ def go_extractor_cmd(source_root, database, options):
5860
5961def  java_extractor_cmd (source_root , database , options ):
6062    cmd  =  list ()
61-     cmd  +=  jar_extractor_cmd (Extractor .java_extractor , source_root , database )
63+     cmd  +=  jar_extractor_cmd (Extractor .java_extractor , source_root , database ,  options )
6264    if  options :
65+         if  "white-list"  in  options  and  "whiteList"  in  options :
66+             logging .error ("white-list and whiteList cannot be configured at the same time" )
67+             return  - 1 
68+         if  "cp"  in  options  and  "classpath"  in  options :
69+             logging .error ("cp and classpath cannot be configured at the same time" )
70+             return  - 1 
6371        for  (key , value ) in  options .items ():
6472            if  key  ==  "white-list"  or  key  ==  "whiteList" :
65-                 cmd  +=  ["-w=" , value ]
66-             elif  key  ==  "cp" :
67-                 cmd  +=  ["-cp=" , value ]
68-             elif  key  ==  "classpath" :
69-                 cmd  +=  ["--classpath=" , value ]
73+                 cmd  +=  ["-w="  +  value ]
74+             elif  key  ==  "cp"  or  key  ==  "classpath" :
75+                 cmd  +=  ["-cp="  +  value ]
7076            elif  key  ==  "incremental" :
7177                if  value  ==  "true" :
7278                    cmd  +=  ["--incremental" ]
@@ -80,8 +86,9 @@ def java_extractor_cmd(source_root, database, options):
8086                    logging .warning ("java.incremental does not take effect, please use java.incremental=true" )
8187            else :
8288                if  key  !=  "cache-dir"  and  key  !=  "commit"  and  key  !=  "remote-cache-type"  and  \
83-                         key  !=  "oss-bucket"  and  key  !=  "oss-config-file"  and  key  !=  "oss-path-prefix" :
84-                     logging .warning ("unsupported config name:%s for java extractor." , key )
89+                         key  !=  "oss-bucket"  and  key  !=  "oss-config-file"  and  key  !=  "oss-path-prefix"  and  \
90+                         key  !=  "jvm_opts" :
91+                     logging .warning ("unsupported config name: %s for java extractor." , key )
8592    if  "incremental"  not  in options  or  options ["incremental" ] !=  "true" :
8693        cmd  +=  ["--parallel" ]
8794    return  cmd 
@@ -124,7 +131,7 @@ def javascript_extractor_cmd(source_root, database, options):
124131
125132
def properties_extractor_cmd(source_root, database, options):
    """Build the command line for the properties extractor.

    The whole command is produced by ``jar_extractor_cmd``; ``options`` is
    forwarded unchanged because that helper takes it as its fourth argument.

    Args:
        source_root: Root directory of the sources to extract.
        database: Output location handed to the extractor.
        options: Optional mapping of extractor config names to values
            (may be ``None`` or empty).

    Returns:
        The argv list returned by ``jar_extractor_cmd``.
    """
    return jar_extractor_cmd(Extractor.properties_extractor, source_root, database, options)
129136
130137
@@ -136,13 +143,13 @@ def python_extractor_cmd(source_root, database, options):
136143
def sql_extractor_cmd(source_root, database, options):
    """Build the command line for the SQL extractor.

    Starts from the command produced by ``jar_extractor_cmd`` and appends the
    SQL dialect when the caller configured one.

    Args:
        source_root: Root directory of the sources to extract.
        database: Output location handed to the extractor.
        options: Optional mapping of extractor config names to values
            (may be ``None`` or empty). The ``sql-dialect-type`` entry, when
            present, is passed through as ``--sql-dialect-type <value>``.

    Returns:
        A new argv list.
    """
    cmd = list(jar_extractor_cmd(Extractor.sql_extractor, source_root, database, options))
    # Guard against options being None/empty before the membership test; a
    # bare ``in`` on None would raise TypeError.
    if options and "sql-dialect-type" in options:
        cmd += ["--sql-dialect-type", options["sql-dialect-type"]]
    return cmd
143150
144151
145- def  swift_extractor (source_root , database , options ):
152+ def  swift_extractor_cmd (source_root , database , options ):
146153    cmd  =  list ()
147154    cmd  +=  [str (Extractor .swift_extractor ), str (source_root ), str (database )]
148155    if  options :
@@ -156,23 +163,56 @@ def swift_extractor(source_root, database, options):
156163
157164
def xml_extractor_cmd(source_root, database, options):
    """Build the command line for the XML extractor.

    The whole command is produced by ``jar_extractor_cmd``; ``options`` is
    forwarded unchanged because that helper takes it as its fourth argument.

    Args:
        source_root: Root directory of the sources to extract.
        database: Output location handed to the extractor.
        options: Optional mapping of extractor config names to values
            (may be ``None`` or empty).

    Returns:
        The argv list returned by ``jar_extractor_cmd``.
    """
    return jar_extractor_cmd(Extractor.xml_extractor, source_root, database, options)
161168
162169
163- def  jar_extractor_cmd (extractor_path , source_root , database ):
164-     # 获取内存信息 
165-     mem  =  psutil .virtual_memory ()
166-     total_memory  =  mem .total 
167-     pod_memory_limit  =  get_pod_memory_limit ()
168-     if  pod_memory_limit  !=  0 :
169-         total_memory  =  pod_memory_limit 
170-     total_memory_gb  =  round (total_memory  /  (1024  **  3 ))
171-     logging .info ("current memory is : %s GB" , total_memory_gb )
172-     xmx  =  max (total_memory_gb  -  1 , 6 )
173-     logging .info ("final -Xmx is: %s GB" , xmx )
def arkts_extractor_cmd(source_root, database, options):
    """Build the command line for the ArkTS source extractor.

    Args:
        source_root: Root directory of the sources; passed after ``-s``.
        database: Output directory. It must support the ``/`` operator
            (e.g. ``pathlib.Path``); the database file
            ``coref_arkts_src.db`` is placed under it and passed after ``-d``.
        options: Optional mapping of config names to values (may be ``None``
            or empty). Recognised names: ``blacklist``/``b`` (comma-separated
            list), ``use-gitignore``, ``extract-text``, ``extract-deps``,
            ``file-size-limit`` and ``paths``. Any other name is logged as
            unsupported and ignored.

    Returns:
        The argv list for the extractor process.
    """
    cmd = [
        str(Extractor.arkts_extractor), "extract",
        "--extract-text", "-s", str(source_root),
        "-d", str(database / "coref_arkts_src.db"),
    ]
    if not options:
        return cmd

    for key, value in options.items():
        if key in ("blacklist", "b"):
            # Drop empty items ("a,,b", trailing comma, empty value) so no
            # empty-string arguments reach the extractor.
            entries = [item.strip() for item in str(value).split(",") if item.strip()]
            if entries:
                cmd += ["--blacklist"] + entries
        elif key == "extract-text":
            # --extract-text is always part of the base command above;
            # emitting it a second time would only duplicate the flag.
            continue
        elif key in ("use-gitignore", "extract-deps"):
            # NOTE(review): the option's value is not inspected, so its mere
            # presence enables the flag (even "false") - confirm intended.
            cmd.append("--" + key)
        elif key in ("file-size-limit", "paths"):
            # str() keeps argv homogeneous if the config value is not a string.
            cmd += ["--" + key, str(value)]
        else:
            logging.warning("unsupported config name:%s for arkts extractor.", key)
    return cmd
192+ 
193+ 
def jar_extractor_cmd(extractor_path, source_root, database, options):
    """Build the ``java ... -jar`` command line shared by jar-based extractors.

    Args:
        extractor_path: Path of the extractor jar.
        source_root: Root directory of the sources to extract.
        database: Output location handed to the extractor.
        options: Optional mapping of config names to values (may be ``None``
            or empty). Only ``jvm_opts`` is read here: a shell-style string of
            JVM flags that replaces the automatically computed heap size.

    Returns:
        ``["java", <jvm flags...>, "-jar", extractor_path, source_root,
        database]`` with every element converted to ``str``.
    """
    # jvm_opts from the user-specified extract config take precedence.
    jvm_opts = options.get("jvm_opts") if options else None

    # No jvm_opts configured: derive -Xmx from the machine's total memory.
    if not jvm_opts:
        total_memory_gb = round(psutil.virtual_memory().total / (1024 ** 3))
        # Consider at most 32 GB, leave 1 GB of headroom, never go below 6 GB.
        xmx = max(min(total_memory_gb, 32) - 1, 6)
        logging.info("current memory is: %s GB, will use xmx: %s GB.", total_memory_gb, xmx)
        # The "g" suffix is required: a bare number would be read as bytes.
        jvm_opts = f"-Xmx{xmx}g"

    logging.info("extract jvm_opts is: %s .", jvm_opts)

    # shlex.split honours quoting, so one flag may contain spaces.
    cmd = ["java"] + shlex.split(str(jvm_opts)) + ["-jar", str(extractor_path)]
    cmd += [str(source_root), str(database)]
    return cmd
178218
@@ -190,26 +230,6 @@ def extractor_run(language, source_root, database, timeout, options):
190230        tmp  =  Runner (cmd , timeout )
191231        return  tmp .subrun ()
192232    else :
193-         logging .error ("Not supported language: %s " , language )
233+         logging .error ("Failed to obtain the %s extractor " , language )
194234        return  - 1 
195235
196- 
197- def  get_pod_memory_limit ():
198-     # cgroup 文件系统路径 
199-     memory_limit_path  =  "/sys/fs/cgroup/memory/memory.limit_in_bytes" 
200-     memory_limit  =  0 
201-     try :
202-         with  open (memory_limit_path , 'r' ) as  f :
203-             memory_limit  =  int (f .read ().strip ())
204-     except  FileNotFoundError :
205-         pass 
206-     except  PermissionError :
207-         logging .error ("Permission denied when accessing cgroup files." )
208-     except  IOError  as  e :
209-         logging .error (f"IO error occurred when accessing cgroup files: { e }  )
210-     except  Exception  as  e :
211-         logging .error (f"An unexpected error occurred: { e }  )
212-     return  memory_limit 
213- 
214- 
215- 
0 commit comments