Python + pyspider 小说爬虫,源码,需要的拿去用,另可写爬虫!

  • A+

自从花了2天学习了一下爬虫后,妈妈再也不用担心我没书看了!

爬虫收费代写!

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-04-17 19:28:17
# Project: Zhuishushenqi.com

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider handler that crawls novels from zhuishushenqi.com.

    Task flow:
      1. ``on_start`` queues the category listing pages.
      2. ``list_Caterg_detail`` queues every book page found on a listing.
      3. ``index_page`` collects book metadata and queues every chapter.
      4. ``detail_page`` emits one result record per chapter.
    """

    # Browser-like request headers sent with every fetch.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Category code stored in each result, keyed by the site's "major"
    # category id used in the listing URL.
    CATER_NAMES = {
        '1': 11, '6': 11,
        '11': 22, '15': 22,
        '20': 33, '27': 33,
        '31': 44, '35': 44,
        '41': 55, '46': 55,
        '51': 66, '59': 66,
    }

    def __init__(self):
        super(Handler, self).__init__()
        # Listing URL = base_url1 + <category id> + base_url2 + <page number>.
        self.base_url1 = 'http://www.zhuishushenqi.com/category?gender=male&type=hot&major='
        self.base_url2 = '&minor=&page='
        self.CaterId = []  # kept for backward compatibility; no longer mutated
        self.CaterIds = ['1', '6', '11', '15', '20', '27', '31', '35', '41', '46', '51', '59']
        self.page_num = 1   # first listing page to queue
        self.total_num = 5  # last listing page to queue (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue listing pages ``page_num``..``total_num`` for every category.

        The page counter is a local variable: advancing ``self.page_num``
        would leave it past ``total_num`` after the first run, so any later
        invocation on the same instance would queue nothing.
        """
        for page_num in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                # Ids without an entry in the table get None as their code.
                cater_name = self.CATER_NAMES.get(str(cater_id))
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page_num)
                self.crawl(url, callback=self.list_Caterg_detail, save=cater_name)

    def list_Caterg_detail(self, response):
        """Queue each book on a listing page, then follow its sort-cell links.

        ``response.save`` carries the category code set by ``on_start``.
        """
        Cater_Name = response.save
        for each in response.doc('.books-list a[href*="/book"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Cater_Name)
        for each in response.doc('.sort-cells > a.sort-cell').items():
            print(each.attr.href)
            # Forward the category code; without it, books reached through
            # these pages were recorded with Cater_Name = None.
            # NOTE(review): assumes these links stay within the same
            # category -- confirm against the site markup.
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Collect book metadata from a book page and queue its chapters.

        The metadata dict is passed to ``detail_page`` through ``save`` so
        that every chapter record carries the book information.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        Book_author = response.doc('.sup > a').text()
        Book_Introduction = response.doc('.intro').text()
        Book_Synopsis = response.doc('.sup').eq(1).text()

        # Third '|'-separated field of the first ".sup" element; fall back to
        # an empty string instead of raising IndexError on a shorter line.
        sup_fields = response.doc('.sup  ').eq(0).text().split('|')
        Book_Palabras = sup_fields[2] if len(sup_fields) > 2 else ''

        # The last matching image wins, as before; img stays None when the
        # page has none, so the chapter loop below no longer hits a
        # NameError on an undefined Datos.
        img = None
        for imgs in response.doc('.book-info img[src^="http"]').items():
            img = imgs.attr.src
            print(img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        for each in response.doc('.chapter-list.hidden-list  a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @config(priority=2)
    def detail_page(self, response):
        """Return the result record for one chapter page.

        Combines the book metadata saved by ``index_page`` with the chapter
        URL, title and text scraped from this page.
        """
        book = response.save
        return {
            "Cater_Name": book['Cater_Name'],
            "Bookname": book['Bookname'],
            "Book_author": book['Book_author'],
            "Book_Introduction": book['Book_Introduction'],
            "Book_Synopsis": book['Book_Synopsis'],
            "Book_Palabras": book['Book_Palabras'],
            "img": book['img'],
            "BookUrl": response.url,
            "title": response.doc('.current-chapter').text(),
            "BookConte": response.doc('.inner-text').text(),
        }

weixin
QQ群:20764411
QQ群扫码加群
avatar