@@ -177,16 +177,27 @@ def log(msg, level="info"):
177177
178178 # Parse with fallback encodings
179179 try :
180- main_soup = BeautifulSoup (raw , features = "lxml" , from_encoding = encFmt )
181- except Exception as e1 :
182- log ("[WARN] Parsing with {} failed: {}" .format (encFmt , e1 ))
180+ # First attempt: use lxml if available, else let BS pick
183181 try :
184- main_soup = BeautifulSoup (raw , features = "lxml" , from_encoding = "utf-8" )
185- log ("[INFO] Parsed successfully with UTF-8 fallback." )
186- except Exception as e2 :
187- log ("[ERROR] UTF-8 parse also failed: {}" .format (e2 ))
188- main_soup = BeautifulSoup (raw , features = "lxml" , from_encoding = encFmt )
189- log ("[INFO] Retrying parse with original encoding fallback." )
182+ import lxml # noqa
183+ parser = "lxml"
184+ except ImportError :
185+ parser = "html.parser"
186+
187+ main_soup = BeautifulSoup (raw , features = parser )
188+
189+ except Exception :
190+ try :
191+ # Fallback to UTF-8
192+ main_soup = BeautifulSoup (
193+ raw .encode ("utf-8" , "replace" ),
194+ features = parser
195+ )
196+ except Exception :
197+ # Final fallback: use whatever parser is available, no feature string
198+ main_soup = BeautifulSoup (
199+ raw .encode (encFmt , "replace" )
200+ )
190201
191202 if len (main_soup .find_all ('table' )) < 1 :
192203 raise LookupError ("No <table> elements found in {}" .format (current_file ))
@@ -250,20 +261,27 @@ def log(msg, level="info"):
250261
251262 soup = None
252263 try :
253- soup = BeautifulSoup (raw , features = "lxml" , from_encoding = encFmt )
254- except Exception as e1 :
255- log ("[WARN] Parsing with {} failed: {}" .format (encFmt , e1 ))
264+ # First attempt: use lxml if available, else let BS pick
256265 try :
257- soup = BeautifulSoup (raw , features = "lxml" , from_encoding = "utf-8" )
258- log ("[INFO] Parsed successfully with UTF-8 fallback." )
259- except Exception as e2 :
260- log ("[ERROR] UTF-8 parse failed: {}" .format (e2 ))
261- try :
262- soup = BeautifulSoup (raw , features = "lxml" , from_encoding = encFmt )
263- log ("[INFO] Retried parse with {} encoding." .format (encFmt ))
264- except Exception as e3 :
265- log ("[FATAL] Could not parse report '{}': {}" .format (file , e3 ))
266- continue
266+ import lxml # noqa
267+ parser = "lxml"
268+ except ImportError :
269+ parser = "html.parser"
270+
271+ main_soup = BeautifulSoup (raw , features = parser )
272+
273+ except Exception :
274+ try :
275+ # Fallback to UTF-8
276+ main_soup = BeautifulSoup (
277+ raw .encode ("utf-8" , "replace" ),
278+ features = parser
279+ )
280+ except Exception :
281+ # Final fallback: use whatever parser is available, no feature string
282+ main_soup = BeautifulSoup (
283+ raw .encode (encFmt , "replace" )
284+ )
267285
268286 try :
269287 if soup .find (id = "report-title" ):
0 commit comments