Skip to content

Scrapy tutorial example using parslepy: DMOZ Spider

Paul Tremberth edited this page Jun 25, 2013 · 1 revision

The original spider

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

from tutorial.items import DmozItem

class DmozSpider(BaseSpider):
    """Spider "dmoz": turns every ``//ul/li`` node of the start pages into a DmozItem.

    Crawling is restricted to ``dmoz.org`` and begins at the two Python
    directory pages listed in ``start_urls``.
    """

    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        """Build one DmozItem per ``//ul/li`` node found in *response*.

        For each node the item fields hold whatever ``.extract()`` returns
        for these XPaths, evaluated relative to the node:

        * ``title`` -- ``a/text()``
        * ``link``  -- ``a/@href``
        * ``desc``  -- ``text()``

        Returns the items as a list, in document order of the matched nodes.
        """
        selector = HtmlXPathSelector(response)
        return [
            DmozItem(
                title=node.select('a/text()').extract(),
                link=node.select('a/@href').extract(),
                desc=node.select('text()').extract()
            )
            for node in selector.select('//ul/li')
        ]

Rewritten using parslepy

from scrapy.spider import BaseSpider

import parslepy
import cStringIO as StringIO

from tutorial.items import DmozItem

class ParsleyDmozSpider(BaseSpider):
    """Spider "parsleydmoz": same crawl as the XPath version, driven by a parselet.

    The extraction rules live in the class-level ``dmozparselet`` instead of
    being written out as selector calls inside ``parse``.
    """

    name = "parsleydmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    # One record per ``//ul/li`` node, collected under the "dmozitems" key;
    # the record keys match the DmozItem field names used by ``parse``.
    dmozparselet = parslepy.Parselet({
        "dmozitems(//ul/li)": [{
            "title": "a",
            "link": "a @href",
            "desc": "."
        }]
    })

    def parse(self, response):
        """Run the parselet over the response body and wrap each record.

        Returns a list of DmozItem objects built from the "dmozitems"
        records, or ``None`` when that key is missing or empty.
        """
        # The parselet is given a file-like object wrapping the raw body.
        body_stream = StringIO.StringIO(response.body)
        extracted = self.dmozparselet.parse(body_stream)

        records = extracted.get("dmozitems")
        if not records:
            return None
        return [DmozItem(**record) for record in records]

Rewritten using parslepy.utils.scrapytools.ParsleyLoader

from parsleycrawler.items import DmozItem
import parslepy
import cStringIO as StringIO
import parslepy.utils.scrapytools
class ParsleyDmoz2Spider(BaseSpider):
    """Spider "parsleydmoz2": items and follow-up requests via ParsleyLoader.

    A single parselet describes both the item records ("dmozitems") and a
    list of navigation hrefs ("dmozurls"). ``item_config`` and ``req_config``
    tell the loader which extracted key feeds items and which feeds requests.
    """

    name = "parsleydmoz2"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    parselet = parslepy.Parselet({
        "dmozitems(//ul/li)": [{
            "title": "a",
            "link": "a @href",
            "desc": "."
        }],
        "dmozurls": [".//div[@class='navigate']//a[re:test(@href, '^/')]/@href"]
    })
    loader = parslepy.utils.scrapytools.ParsleyLoader(parselet)

    # Items are built as DmozItem instances from the "dmozitems" records.
    item_config = parslepy.utils.scrapytools.ParsleyItemLoaderConfig(
        item_class=DmozItem,
        iter_item_key="dmozitems")

    # Requests are driven by the "link" value of each "dmozitems" record.
    # Alternative left disabled by the author: iter_request_key="dmozurls"
    # (the navigation hrefs) with no url_getter.
    req_config = parslepy.utils.scrapytools.ParsleyRequestConfig(
        iter_request_key="dmozitems",
        url_getter=lambda record: record.get("link"))

    def parse(self, response):
        """Yield everything the loader's item pass produces, then its request pass.

        Each loader method is only called once the previous pass has been
        fully consumed, so items always come out before requests.
        """
        passes = (
            (self.loader.iter_items, self.item_config),
            (self.loader.iter_requests, self.req_config),
        )
        for produce, config in passes:
            for result in produce(config, response):
                yield result
Clone this wiki locally