Python-Crawler

2017/3/15 posted in Python

总文件

import scrapy
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

class QuotesSpider(scrapy.Spider):
    """Skeleton spider that downloads the xujc.com.cn home page.

    ``parse`` currently only selects every ``<table>`` element; the
    commented-out sketches below it show how the selection could be
    inspected or how a page could be written to disk.
    """

    name = "quotes"

    def start_requests(self):
        """Yield one request per seed URL, each handled by ``parse``."""
        seeds = ('http://www.xujc.com.cn/',)
        for seed in seeds:
            yield scrapy.Request(url=seed, callback=self.parse)

    def parse(self, response):
        """Select all ``<table>`` elements of the downloaded page.

        Nothing is printed or returned yet; the selection is kept in a
        local so the sketches below can be enabled one at a time.
        """
        tables = Selector(response).xpath('//table')

        # Sketch 1: print each table followed by the markup of its cells.
        # for table in tables:
        #     cells = table.xpath('tr/td').extract()
        #     print(table)
        #     print(cells)

        # Sketch 2: write a page to disk. NOTE: `contents` is not defined
        # anywhere in this sketch and must be supplied before enabling it.
        # filename = 'school-%s.html' % 1
        # with open(filename, 'wb') as f:
        #     f.write(contents)
        # self.log('Saved file %s' % filename)

重点关注:链接和标题

import scrapy
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

class QuotesSpider(scrapy.Spider):
    """Spider that prints link texts, hrefs and cell texts from one table.

    The home page is fetched and all ``<table>`` elements are selected in
    document order. The table at ``LINK_TABLE_INDEX`` is mined for anchor
    text, anchor ``href`` values and plain cell text; the table at
    ``DEBUG_TABLE_INDEX`` is printed as-is.
    """

    name = "quotes"

    # Zero-based positions within the ``//table`` selection. They are tied
    # to the current page layout and will need adjusting if it changes.
    LINK_TABLE_INDEX = 16
    DEBUG_TABLE_INDEX = 10

    def start_requests(self):
        """Yield one request per seed URL, each handled by ``parse``."""
        urls = [
            'http://www.xujc.com.cn/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Print the selected table and the strings extracted from it.

        If the page contains fewer tables than the configured indices
        require, a warning is logged and nothing is printed, instead of
        failing with ``IndexError``.
        """
        tables = Selector(response).xpath('//table')

        # Both indices are hard-coded, so verify the page is large enough
        # before indexing into the selection.
        required = max(self.LINK_TABLE_INDEX, self.DEBUG_TABLE_INDEX) + 1
        if len(tables) < required:
            self.logger.warning(
                "Expected at least %d <table> elements at %s, found %d; "
                "skipping extraction",
                required, response.url, len(tables)
            )
            return

        title = tables[self.LINK_TABLE_INDEX].xpath(
            'tr/td/a/text() | tr/td/a/@href | tr/td/text()'
        ).extract()
        print(tables[self.DEBUG_TABLE_INDEX])
        print(title)

日期时间(以下片段用于替换上文 parse 方法中的提取语句)

# Date/time variant of the extraction step. These statements belong in the
# body of a spider's ``parse`` callback and rely on ``sties`` already holding
# the page's table selection (presumably ``sel.xpath('//table')`` as in the
# spider shown earlier -- confirm). Indentation is normalised so the three
# statements form one consistent block; re-indent them together when pasting
# into a method.
title = sties[25].xpath('tr/td/table/tr/td[@id="zb"]/table/tr/td/span/text()').extract()
print(sties[10])
print(title)

通知公告

import scrapy
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

class QuotesSpider(scrapy.Spider):
    """Spider that prints anchor hrefs and texts for each row of the page
    requested with ``lx=notice``.
    """

    name = "quotes"

    def start_requests(self):
        """Yield one request per listing URL, each handled by ``parse``."""
        listing_pages = (
            'http://www.xujc.com.cn/index.php?c=Article&a=idxnews&lx=notice',
        )
        for page in listing_pages:
            yield scrapy.Request(url=page, callback=self.parse)

    def parse(self, response):
        """Print every top-level table row and its anchor hrefs/texts."""
        rows = Selector(response).xpath('/html/body/table/tr')
        for row in rows:
            # One flat list per row: href attributes and link texts of the
            # anchors found directly inside the row's cells.
            link_parts = row.xpath('td/a/@href | td/a/text()').extract()
            print(row)
            print(link_parts)

新闻中心

import scrapy
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

class QuotesSpider(scrapy.Spider):
    """Spider that prints anchor hrefs and texts for each list item of the
    page requested with ``lx=news``.
    """

    name = "quotes"

    def start_requests(self):
        """Yield one request per listing URL, each handled by ``parse``."""
        listing_pages = (
            'http://www.xujc.com.cn/index.php?c=Article&a=idxnews&lx=news',
        )
        for page in listing_pages:
            yield scrapy.Request(url=page, callback=self.parse)

    def parse(self, response):
        """Print every ``<li>`` under the body's ``<ul>`` and its anchors."""
        entries = Selector(response).xpath('/html/body/ul/li')
        for entry in entries:
            # One flat list per item: href attributes and link texts of the
            # anchors that are direct children of the list item.
            link_parts = entry.xpath('a/@href | a/text()').extract()
            print(entry)
            print(link_parts)