RegexExtractionStrategy returns empty result #1204
-
Issue

RegexExtractionStrategy does not extract data.

Code

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import RegexExtractionStrategy
async def extract_ethereum_exchange_wallets(url: str):
    browser_cfg = BrowserConfig(
        headless=False,  # Visible for demonstration
        verbose=True
    )
    session_id = "ethereum_wallets"
    # 1. Use base_wait to initially wait for dynamic content table id="address-datatable" to be loaded
    base_wait = """js:() => {
        return document.getElementById('address-datatable') !== null;
    }"""
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # Load the initial page
        initial_config = CrawlerRunConfig(
            wait_for=base_wait,
            session_id=session_id,
            cache_mode=CacheMode.BYPASS,
        )
        await crawler.arun(
            url=url,
            config=initial_config
        )
        print("Initial page loaded and address-datatable is present.")
        js_interact_rows = """
        (async function() { // Wrapped in an async immediately-invoked function expression (IIFE)
            const addressRows = document.querySelectorAll('tr.address-row');
            for (let i = 0; i < addressRows.length; i++) {
                const clickEvent = new MouseEvent('click', {
                    bubbles: true,
                    cancelable: true,
                    view: window
                });
                addressRows[i].dispatchEvent(clickEvent);
                // You might want to add a small delay here if clicks trigger new content
                await new Promise(resolve => setTimeout(resolve, 100));
            }
        })();
        """
        print("Clicking rows to expand details...")
        click_and_wait_config = CrawlerRunConfig(
            session_id=session_id,
            js_code=js_interact_rows,
            scan_full_page=True,
            js_only=True,  # Continue from the open tab
            cache_mode=CacheMode.BYPASS
        )
        result = await crawler.arun(
            url=url,  # URL doesn't change, but needed by arun
            config=click_and_wait_config
        )
        if not result.success:
            print("Failed to click on wallets:", result.error_message)
            return
        regex_pattern = r'<tr[^>]*?class="address-row"[^>]*?id="([^"]+)"[^>]*?>'
        # 2. Create the extraction strategy
        extraction_strategy = RegexExtractionStrategy(
            patterns=[regex_pattern],
            field_names=["address_row_id"],  # Changed field name to reflect what's being extracted
            verbose=True
        )
        extraction_config = CrawlerRunConfig(
            session_id=session_id,
            extraction_strategy=extraction_strategy,
            scan_full_page=True,
            js_only=True,  # Continue from the open tab
            cache_mode=CacheMode.BYPASS
        )
        result = await crawler.arun(
            url=url,  # URL doesn't change, but needed by arun
            config=extraction_config
        )
        if not result.success:
            print("Failed to extract wallets:", result.error_message)
            return
        print(result.extracted_content)

async def main():
    await extract_ethereum_exchange_wallets("https://example.com")

if __name__ == "__main__":
    asyncio.run(main())

Variant with dummy HTML:

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import RegexExtractionStrategy
async def extract_ethereum_exchange_wallets(url: str):
    browser_cfg = BrowserConfig(
        headless=False,  # Visible for demonstration
        verbose=True
    )
    session_id = "ethereum_wallets"
    # 1. Use base_wait to initially wait for dynamic content table id="address-datatable" to be loaded
    base_wait = """js:() => {
        return document.getElementById('address-datatable') !== null;
    }"""
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # Load the initial page
        initial_config = CrawlerRunConfig(
            wait_for=base_wait,
            session_id=session_id,
            cache_mode=CacheMode.BYPASS,
        )
        await crawler.arun(
            url=url,
            config=initial_config
        )
        print("Initial page loaded and address-datatable is present.")
        js_interact_rows = """
        (async function() { // Wrapped in an async immediately-invoked function expression (IIFE)
            const addressRows = document.querySelectorAll('tr.address-row');
            for (let i = 0; i < addressRows.length; i++) {
                const clickEvent = new MouseEvent('click', {
                    bubbles: true,
                    cancelable: true,
                    view: window
                });
                addressRows[i].dispatchEvent(clickEvent);
                // You might want to add a small delay here if clicks trigger new content
                await new Promise(resolve => setTimeout(resolve, 100));
            }
        })();
        """
        print("Clicking rows to expand details...")
        click_and_wait_config = CrawlerRunConfig(
            session_id=session_id,
            js_code=js_interact_rows,
            scan_full_page=True,
            js_only=True,  # Continue from the open tab
            cache_mode=CacheMode.BYPASS
        )
        result = await crawler.arun(
            url=url,  # URL doesn't change, but needed by arun
            config=click_and_wait_config
        )
        if not result.success:
            print("Failed to click on wallets:", result.error_message)
            return
        dummy_html_content = """
        <table>
            <tr class="address-row" id="address_A" data-code="CODE_A">
                <td>Address A</td>
            </tr>
            <tr class="chain-expand" id="wallet_A1">
                <td>Wallet A1</td>
            </tr>
            <tr class="chain-expand" some-other-attr="val" id="wallet_A2">
                <td>Wallet A2</td>
            </tr>
            <tr class="address-row" id="address_B" data-code="CODE_B">
                <td>Address B</td>
            </tr>
            <tr class="chain-expand" id="wallet_B1">
                <td>Wallet B1</td>
            </tr>
            <tr class="chain-expand" id="wallet_B2">
                <td>Wallet B2</td>
            </tr>
            <tr class="chain-expand" id="wallet_B3">
                <td>Wallet B3</td>
            </tr>
            <tr class="address-row" id="address_C" data-code="CODE_C">
                <td>Address C</td>
            </tr>
            <tr class="chain-expand" id="wallet_C1">
                <td>Wallet C1</td>
            </tr>
            <tr>
                <td>Some other row</td>
            </tr>
            <tr class="chain-expand-different" id="not_this_one">
                <td>Different class</td>
            </tr>
            <tr class="address-row" data-code="CODE_D">
                <td>Address D (No ID)</td>
            </tr>
        </table>
        """
        regex_pattern = r'<tr[^>]*?class="address-row"[^>]*?id="([^"]+)"[^>]*?>'
        # 2. Create the extraction strategy
        extraction_strategy = RegexExtractionStrategy(
            patterns=[regex_pattern],
            field_names=["address_row_id"],  # Changed field name to reflect what's being extracted
            verbose=True
        )
        extraction_config = CrawlerRunConfig(
            extraction_strategy=extraction_strategy,
            cache_mode=CacheMode.BYPASS
        )
        result = await crawler.arun(
            url=url,  # URL doesn't change, but needed by arun
            html_content=dummy_html_content,
            config=extraction_config
        )
        if not result.success:
            print("Failed to extract wallets:", result.error_message)
            return
        print(result.extracted_content)

async def main():
    await extract_ethereum_exchange_wallets("https://example.com")

if __name__ == "__main__":
    asyncio.run(main())

Actual result: []

Expected result: A dictionary of wallet IDs.

Module version
Name: Crawl4AI
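
A quick standalone sanity check (a minimal sketch using only Python's re module, outside crawl4ai) shows the pattern itself does match dummy rows that carry an id:

import re

# Two rows trimmed from the dummy HTML above
sample = """
<tr class="address-row" id="address_A" data-code="CODE_A"><td>Address A</td></tr>
<tr class="address-row" data-code="CODE_D"><td>Address D (No ID)</td></tr>
"""
regex_pattern = r'<tr[^>]*?class="address-row"[^>]*?id="([^"]+)"[^>]*?>'

# Prints ['address_A']: the pattern matches rows that carry an id attribute,
# so the empty extraction result points at how the strategy is invoked or at
# the live page's markup, not at the regex engine.
print(re.findall(regex_pattern, sample))

So the empty result is not a failure of the pattern against this dummy markup, which is what the replies below address.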
Replies: 3 comments
-
The URL I am trying with the example above is:
-
hi @rdv0011, there are 2 small issues with the regex:

1/ It was looking for class="address-row" exactly, but the real HTML had other classes, too (like class="address-row even").

2/ It was trying to grab an id attribute from the row, but the rows didn't have one. They had a data-code attribute instead.

The fix is to tweak the regex to handle both of those things. I also combined the steps into a single arun() call, which is a bit cleaner.

You can try this regex:

And you only need to pass the custom parameter in RegexExtractionStrategy, not the other parameters:
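
A minimal sketch consistent with that description (the relaxed class match, the data-code capture group, the wallet_code label, and the single-call flow are reconstructions for illustration, not the reply's verbatim snippet):

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import RegexExtractionStrategy

async def main():
    # Allow extra class tokens (e.g. "address-row even") and capture the
    # data-code value instead of id; both changes follow the reply above.
    regex_pattern = r'<tr[^>]*?class="[^"]*address-row[^"]*"[^>]*?data-code="([^"]+)"'

    # Only the custom parameter is needed; it maps a label to a pattern.
    strategy = RegexExtractionStrategy(custom={"wallet_code": regex_pattern})

    config = CrawlerRunConfig(
        wait_for="js:() => document.getElementById('address-datatable') !== null",
        extraction_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    # Single arun() call: wait for the table, then extract, no session juggling.
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        print(result.extracted_content)

if __name__ == "__main__":
    asyncio.run(main())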
-
@ntohidi, Thank you! It started to work.