-
Notifications
You must be signed in to change notification settings - Fork 15
Scrapy tutorial example using parslepy: DMOZ Spider
Paul Tremberth edited this page Jun 25, 2013
·
1 revision
- http://doc.scrapy.org/en/latest/intro/tutorial.html
- https://github.com/redapple/parslepy/blob/master/parslepy/utils/scrapytools.py
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from tutorial.items import DmozItem
class DmozSpider(BaseSpider):
    """Plain-Scrapy spider that scrapes two DMOZ Python directory pages.

    Every ``<li>`` sitting directly under a ``<ul>`` on the start pages is
    converted into a ``DmozItem`` with ``title``, ``link`` and ``desc`` fields.
    """

    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        """Extract one ``DmozItem`` per ``//ul/li`` node of ``response``.

        :param response: the downloaded page handed to the callback.
        :returns: list of populated ``DmozItem`` objects (empty if no
            ``//ul/li`` node matched).
        """
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul/li')
        items = []
        for site in sites:
            item = DmozItem()
            # XPaths are relative to the current <li> node.
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            item['desc'] = site.select('text()').extract()
            items.append(item)
        return items


# Start of the next example snippet: its first import was fused onto the
# ``return items`` line above and is restored here on its own line.
from scrapy.spider import BaseSpider
import parslepy
import cStringIO as StringIO
from tutorial.items import DmozItem
class ParsleyDmozSpider(BaseSpider):
    """Variant of the DMOZ tutorial spider that extracts data with parslepy.

    The extraction rules live in a single ``parslepy.Parselet`` declared as a
    class attribute, so it is built once and shared by every ``parse`` call.
    """

    name = "parsleydmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    # One entry per ``//ul/li`` node, each with ``title``, ``link`` and
    # ``desc`` keys -- the same field names assigned on DmozItem elsewhere.
    dmozparselet = parslepy.Parselet({
        "dmozitems(//ul/li)": [{
            "title": "a",
            "link": "a @href",
            "desc": "."
        }]
    })

    def parse(self, response):
        """Run the parselet over the response body and build ``DmozItem`` objects.

        :param response: the downloaded page; its ``body`` is wrapped in a
            file-like ``StringIO`` object before being handed to the parselet.
        :returns: list of ``DmozItem`` objects, one per extracted entry; an
            empty list when the ``"dmozitems"`` key is missing or empty.
        """
        parsed = self.dmozparselet.parse(StringIO.StringIO(response.body))
        # Fall back to an empty list so the callback always returns a list
        # instead of implicitly returning None when nothing matched.
        dmozitems = parsed.get("dmozitems") or []
        return [DmozItem(**d) for d in dmozitems]


# Start of the next example snippet: its first import was fused onto the
# ``return`` line above and is restored here on its own line.
from parsleycrawler.items import DmozItem
import parslepy
import cStringIO as StringIO
import parslepy.utils.scrapytools
class ParsleyDmoz2Spider(BaseSpider):
    """DMOZ spider driven by parslepy's ``scrapytools`` helpers.

    A single parselet describes both the items to extract (``dmozitems``) and
    a set of navigation URLs (``dmozurls``). ``ParsleyLoader`` is then asked
    to turn the extracted data into items and into follow-up requests.
    """

    name = "parsleydmoz2"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    parselet = parslepy.Parselet(
        {
            "dmozitems(//ul/li)": [{
                "title": "a",
                "link": "a @href",
                "desc": "."
            }],
            "dmozurls": [".//div[@class='navigate']//a[re:test(@href, '^/')]/@href"]
        })

    loader = parslepy.utils.scrapytools.ParsleyLoader(parselet)

    # Items are built from each entry found under the "dmozitems" key.
    item_config = parslepy.utils.scrapytools.ParsleyItemLoaderConfig(
        item_class=DmozItem,
        iter_item_key="dmozitems")

    # Requests are generated from the "link" of every extracted item.
    # Alternative kept from the original example: iterate the "dmozurls"
    # key instead, i.e.
    #     ParsleyRequestConfig(iter_request_key="dmozurls")
    # With the active configuration the "dmozurls" key is not read in this
    # class.
    req_config = parslepy.utils.scrapytools.ParsleyRequestConfig(
        iter_request_key="dmozitems", url_getter=lambda d: d.get("link"))

    def parse(self, response):
        """Yield everything the loader produces for ``response``.

        Items built according to ``item_config`` are yielded first, followed
        by the requests built according to ``req_config``.

        :param response: the downloaded page handed to the callback.
        """
        for i in self.loader.iter_items(self.item_config, response):
            yield i
        for r in self.loader.iter_requests(self.req_config, response):
            yield r