banner ad

Using Scrapy to crawl data and store it in MySQL

| August 9, 2013 | 0 Comments
0 Flares 0 Flares ×

scrapy

Example: use Scrapy to crawl the book titles and links from the first page of a category listing, and store them in the database.

1. The spider. The fields to crawl are defined in items.py; we want to crawl each book's name and link.

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from second.items import bbs

class bbsSpider(BaseSpider):
    """Spider that scrapes book titles and links from one category page.

    The single start URL is fetched and ``parse`` returns one ``bbs`` item
    whose ``title`` and ``link`` fields hold everything the two XPath
    queries extracted from that page.
    """

    name = "boat"
    # Scrapy reads ``allowed_domains`` (not ``allow_domains``) and expects
    # bare domain names rather than full URLs.
    allowed_domains = ["book.example.com"]
    start_urls = ["http://book.example.com/tag/cate?type=S"]

    def parse(self, response):
        """Extract the book titles and links found in ``response``.

        Returns a one-element list containing a ``bbs`` item. Its ``title``
        field is the result of extracting every matched ``@title`` attribute
        and its ``link`` field the result of extracting every matched
        ``@href`` attribute.
        """
        hxs = HtmlXPathSelector(response)
        item = bbs()
        item['title'] = hxs.select('//ul/li[position()>0]/div[2]/h2/a/@title').extract()
        item['link'] = hxs.select('//ul/li[position()>0]/div[2]/h2/a/@href').extract()
        return [item]

2. Pipelines

# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html

from scrapy import log
from twisted.enterprise import adbapi
from scrapy.http import Request
from scrapy.exceptions import DropItem
from scrapy.contrib.pipeline.images import ImagesPipeline
import time
import MySQLdb
import MySQLdb.cursors
import socket
import select
import sys
import os
import errno
class MySQLStorePipeline(object):
    """Item pipeline that inserts scraped book titles and links into MySQL.

    Inserts are issued through a Twisted ``adbapi`` connection pool via
    ``runInteraction``; ``process_item`` does not wait for the result and
    returns the item unchanged.
    """

    def __init__(self):
        """Create the ``adbapi`` connection pool used for every insert."""
        # NOTE(review): connection settings are hard-coded here; consider
        # reading them from the project settings instead.
        self.dbpool = adbapi.ConnectionPool(
            'MySQLdb',
            db='test',
            user='root',
            passwd='root',
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=False,
        )

    def process_item(self, item, spider):
        """Schedule the insert for ``item`` and return the item unchanged.

        The database work is handed to ``self.dbpool.runInteraction`` with
        ``_conditional_insert`` as the interaction. An errback is attached so
        that a failed interaction is reported instead of being dropped.
        """
        deferred = self.dbpool.runInteraction(self._conditional_insert, item)
        deferred.addErrback(self._handle_error)
        return item

    def _conditional_insert(self, tx, item):
        """Insert one ``book`` row per (title, link) pair held by ``item``.

        ``tx`` is the transaction/cursor object supplied by
        ``runInteraction``; only its ``execute`` method is used. Nothing is
        inserted when the item has no titles. Titles and links are paired
        positionally; if the two lists differ in length, only the pairs that
        exist in both are inserted (no ``IndexError`` is raised).
        """
        titles = item.get('title')
        if not titles:
            return
        links = item.get('link') or ()
        for title, link in zip(titles, links):
            tx.execute('insert into book values (%s, %s)', (title, link))

    def _handle_error(self, failure):
        """Report a failed database interaction through Scrapy's log."""
        log.err(failure)
Download PDF
0 Flares Twitter 0 Facebook 0 Google+ 0 Pin It Share 0 LinkedIn 0 Reddit 0 StumbleUpon 0 0 Flares ×

Tags: ,

Category: Uncategorized

About the Author ()

My name is John Link. I am 26 years old. My major is Computer Science and Technology. I am a junior Python programmer.

Leave a Reply

Your email address will not be published. Required fields are marked *

0 Flares Twitter 0 Facebook 0 Google+ 0 Pin It Share 0 LinkedIn 0 Reddit 0 StumbleUpon 0 0 Flares ×