RegexExtractionStrategy returns empty result #1204
-
Issue

RegexExtractionStrategy does not extract data.

Code

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import RegexExtractionStrategy
async def extract_ethereum_exchange_wallets(url: str):
    browser_cfg = BrowserConfig(
        headless=False,  # Visible for demonstration
        verbose=True
    )
    session_id = "ethereum_wallets"
    # 1. Use base_wait to initially wait for dynamic content table id="address-datatable" to be loaded
    base_wait = """js:() => {
        return document.getElementById('address-datatable') !== null;
    }"""
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # Load the initial page
        initial_config = CrawlerRunConfig(
            wait_for=base_wait,
            session_id=session_id,
            cache_mode=CacheMode.BYPASS,
        )
        await crawler.arun(
            url=url,
            config=initial_config
        )
        print("Initial page loaded and address-datatable is present.")
        js_interact_rows = """
        (async function() { // Wrapped in an async immediately-invoked function expression (IIFE)
            const addressRows = document.querySelectorAll('tr.address-row');
            for (let i = 0; i < addressRows.length; i++) {
                const clickEvent = new MouseEvent('click', {
                    bubbles: true,
                    cancelable: true,
                    view: window
                });
                addressRows[i].dispatchEvent(clickEvent);
                // You might want to add a small delay here if clicks trigger new content
                await new Promise(resolve => setTimeout(resolve, 100));
            }
        })();
        """
        print("Clicking rows to expand details...")
        click_and_wait_config = CrawlerRunConfig(
            session_id=session_id,
            js_code=js_interact_rows,
            scan_full_page=True,
            js_only=True,  # Continue from the open tab
            cache_mode=CacheMode.BYPASS
        )
        result = await crawler.arun(
            url=url,  # URL doesn't change, but needed by arun
            config=click_and_wait_config
        )
        if not result.success:
            print("Failed to click on wallets:", result.error_message)
            return
        regex_pattern = r'<tr[^>]*?class="address-row"[^>]*?id="([^"]+)"[^>]*?>'
        # 2. Create the extraction strategy
        extraction_strategy = RegexExtractionStrategy(
            patterns=[regex_pattern],
            field_names=["address_row_id"],  # Changed field name to reflect what's being extracted
            verbose=True
        )
        extraction_config = CrawlerRunConfig(
            session_id=session_id,
            extraction_strategy=extraction_strategy,
            scan_full_page=True,
            js_only=True,  # Continue from the open tab
            cache_mode=CacheMode.BYPASS
        )
        result = await crawler.arun(
            url=url,  # URL doesn't change, but needed by arun
            config=extraction_config
        )
        if not result.success:
            print("Failed to extract wallets:", result.error_message)
            return
        print(result.extracted_content)

async def main():
    await extract_ethereum_exchange_wallets("https://example.com")

if __name__ == "__main__":
    asyncio.run(main())

Variant with dummy HTML:

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import RegexExtractionStrategy
async def extract_ethereum_exchange_wallets(url: str):
    browser_cfg = BrowserConfig(
        headless=False,  # Visible for demonstration
        verbose=True
    )
    session_id = "ethereum_wallets"
    # 1. Use base_wait to initially wait for dynamic content table id="address-datatable" to be loaded
    base_wait = """js:() => {
        return document.getElementById('address-datatable') !== null;
    }"""
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # Load the initial page
        initial_config = CrawlerRunConfig(
            wait_for=base_wait,
            session_id=session_id,
            cache_mode=CacheMode.BYPASS,
        )
        await crawler.arun(
            url=url,
            config=initial_config
        )
        print("Initial page loaded and address-datatable is present.")
        js_interact_rows = """
        (async function() { // Wrapped in an async immediately-invoked function expression (IIFE)
            const addressRows = document.querySelectorAll('tr.address-row');
            for (let i = 0; i < addressRows.length; i++) {
                const clickEvent = new MouseEvent('click', {
                    bubbles: true,
                    cancelable: true,
                    view: window
                });
                addressRows[i].dispatchEvent(clickEvent);
                // You might want to add a small delay here if clicks trigger new content
                await new Promise(resolve => setTimeout(resolve, 100));
            }
        })();
        """
        print("Clicking rows to expand details...")
        click_and_wait_config = CrawlerRunConfig(
            session_id=session_id,
            js_code=js_interact_rows,
            scan_full_page=True,
            js_only=True,  # Continue from the open tab
            cache_mode=CacheMode.BYPASS
        )
        result = await crawler.arun(
            url=url,  # URL doesn't change, but needed by arun
            config=click_and_wait_config
        )
        if not result.success:
            print("Failed to click on wallets:", result.error_message)
            return
        dummy_html_content = """
        <table>
            <tr class="address-row" id="address_A" data-code="CODE_A">
                <td>Address A</td>
            </tr>
            <tr class="chain-expand" id="wallet_A1">
                <td>Wallet A1</td>
            </tr>
            <tr class="chain-expand" some-other-attr="val" id="wallet_A2">
                <td>Wallet A2</td>
            </tr>
            <tr class="address-row" id="address_B" data-code="CODE_B">
                <td>Address B</td>
            </tr>
            <tr class="chain-expand" id="wallet_B1">
                <td>Wallet B1</td>
            </tr>
            <tr class="chain-expand" id="wallet_B2">
                <td>Wallet B2</td>
            </tr>
            <tr class="chain-expand" id="wallet_B3">
                <td>Wallet B3</td>
            </tr>
            <tr class="address-row" id="address_C" data-code="CODE_C">
                <td>Address C</td>
            </tr>
            <tr class="chain-expand" id="wallet_C1">
                <td>Wallet C1</td>
            </tr>
            <tr>
                <td>Some other row</td>
            </tr>
            <tr class="chain-expand-different" id="not_this_one">
                <td>Different class</td>
            </tr>
            <tr class="address-row" data-code="CODE_D">
                <td>Address D (No ID)</td>
            </tr>
        </table>
        """
        regex_pattern = r'<tr[^>]*?class="address-row"[^>]*?id="([^"]+)"[^>]*?>'
        # 2. Create the extraction strategy
        extraction_strategy = RegexExtractionStrategy(
            patterns=[regex_pattern],
            field_names=["address_row_id"],  # Changed field name to reflect what's being extracted
            verbose=True
        )
        extraction_config = CrawlerRunConfig(
            extraction_strategy=extraction_strategy,
            cache_mode=CacheMode.BYPASS
        )
        result = await crawler.arun(
            url=url,  # URL doesn't change, but needed by arun
            html_content=dummy_html_content,
            config=extraction_config
        )
        if not result.success:
            print("Failed to extract wallets:", result.error_message)
            return
        print(result.extracted_content)

async def main():
    await extract_ethereum_exchange_wallets("https://example.com")

if __name__ == "__main__":
    asyncio.run(main())

Actual result: []

Expected result: A dictionary of wallet IDs.

Module version
Name: Crawl4AI
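
A quick standalone sanity check (a minimal sketch using only Python's re module, outside crawl4ai) shows the pattern itself does match dummy rows that carry an id:

import re

# Two rows trimmed from the dummy HTML above
sample = """
<tr class="address-row" id="address_A" data-code="CODE_A"><td>Address A</td></tr>
<tr class="address-row" data-code="CODE_D"><td>Address D (No ID)</td></tr>
"""
regex_pattern = r'<tr[^>]*?class="address-row"[^>]*?id="([^"]+)"[^>]*?>'

# Prints ['address_A']: the pattern matches rows that carry an id attribute,
# so the empty extraction result points at how the strategy is invoked or at
# the live page's markup, not at the regex engine.
print(re.findall(regex_pattern, sample))

So the empty result is not a failure of the pattern against this dummy markup, which is what the replies below address.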
Replies: 3 comments
-
The URL I am trying with the example above is:
-
hi @rdv0011, there are 2 small issues with the regex:

1/ It was looking for class="address-row" exactly, but the real HTML had other classes, too (like class="address-row even").

2/ It was trying to grab an id attribute from the row, but the rows didn't have one. They had a data-code attribute instead.

The fix is to tweak the regex to handle both of those things. I also combined the steps into a single arun() call, which is a bit cleaner.

You can try this regex:

And you only need to pass the custom parameter in RegexExtractionStrategy, not the other parameters:
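
A minimal sketch consistent with that description (the relaxed class match, the data-code capture group, the wallet_code label, and the single-call flow are reconstructions for illustration, not the reply's verbatim snippet):

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import RegexExtractionStrategy

async def main():
    # Allow extra class tokens (e.g. "address-row even") and capture the
    # data-code value instead of id; both changes follow the reply above.
    regex_pattern = r'<tr[^>]*?class="[^"]*address-row[^"]*"[^>]*?data-code="([^"]+)"'

    # Only the custom parameter is needed; it maps a label to a pattern.
    strategy = RegexExtractionStrategy(custom={"wallet_code": regex_pattern})

    config = CrawlerRunConfig(
        wait_for="js:() => document.getElementById('address-datatable') !== null",
        extraction_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    # Single arun() call: wait for the table, then extract, no session juggling.
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        print(result.extracted_content)

if __name__ == "__main__":
    asyncio.run(main())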
-
@ntohidi, Thank you! It started to work.