Python + pyspider crawler for a novel site: inserts into a database, publishes via Locoy (火车头), downloads resources locally, and other spiders can be written the same way!


Python + pyspider crawler for a novel site: inserts into a database (handy when novels have to go straight into your own tables), publishes via Locoy (火车头) (handy if you already have a Locoy publishing interface), downloads resources locally (cover images), and other spiders can be written the same way!

Testing so far has been close to perfect: each novel is forced to be crawled from the first chapter through to the last, so chapters never come out of order!
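The spider writes into three MySQL tables: BookFile (cover records), BookTitle (chapter index) and BookConte (chapter content). The original post does not show the table definitions, so the following is only a minimal sketch of what they might look like; the column names come from the INSERT statements further down, while every column type is an assumption you should adjust to your own needs.

# Minimal schema sketch: column names are taken from the spider's INSERT
# statements, all types below are assumptions.
import pymysql

DDL = [
    """CREATE TABLE IF NOT EXISTS BookFile (
           Bookname VARCHAR(255), BookID VARCHAR(32), img VARCHAR(512),
           Locaimg VARCHAR(512), Book_Date DATETIME)""",
    """CREATE TABLE IF NOT EXISTS BookTitle (
           Bookname VARCHAR(255), Booktitle VARCHAR(255), BookID VARCHAR(32),
           Titleid VARCHAR(32), Book_Date DATETIME)""",
    """CREATE TABLE IF NOT EXISTS BookConte (
           Bookname VARCHAR(255), Cater_Name VARCHAR(32), Book_author VARCHAR(64),
           Book_Introduction TEXT, Book_Synopsis TEXT, Book_Palabras VARCHAR(32),
           Bookurl VARCHAR(512), Booktitle VARCHAR(255), BookID VARCHAR(32),
           BookConte MEDIUMTEXT, Titleid VARCHAR(32), abover TEXT, Book_Date DATETIME)""",
]

db = pymysql.connect(host="localhost", user="your_db_user", password="your_db_password",
                     db="your_db_name", charset="utf8")
cursor = db.cursor()
for ddl in DDL:
    cursor.execute(ddl)
db.commit()
db.close()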


#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo
# Note: written for Python 2 (print statements, urllib2, sys.setdefaultencoding)

from pyspider.libs.base_handler import *
import pymysql
import random
import datetime
import urllib2, HTMLParser, re
import os
import sys
import codecs
import requests
import json

class Handler(BaseHandler):
    global Datos
    global P_dir
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved during crawling
    Datos = {}
    headers= {
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding':'gzip, deflate, sdch',
    'Accept-Language':'zh-CN,zh;q=0.8',
    'Cache-Control':'max-age=0',
    'Connection':'keep-alive',
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    crawl_config = {
        'headers' : headers,
        'timeout' : 300
    }
    def add_BookFile(self,Bookname, BookIDs, img, Locaimg, Book_Dates):
        db = pymysql.connect(host="localhost",user="your_db_user",password="your_db_password",db="your_db_name",charset="utf8")
        try:
            cursor = db.cursor()
            # note: each "%s" placeholder in the SQL string must be wrapped in double quotes
            sql = 'insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) values ("%s","%s","%s","%s","%s")' % (Bookname, BookIDs, img, Locaimg, Book_Dates);
#            print(sql)
            cursor.execute(sql)
            
            #qid = cursor.lastrowid
            #print(qid)
            
            db.commit()
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
    def add_comment(self,Bookname, Booktitle, BookID, Titleid, Book_Date):
        db = pymysql.connect(host="localhost",user="your_db_user",password="your_db_password",db="your_db_name",charset="utf8")
        try:
            cursor = db.cursor()
            # note: each "%s" placeholder in the SQL string must be wrapped in double quotes
            sql = 'insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) values ("%s","%s","%s","%s","%s")' % (Bookname, Booktitle, BookID, Titleid, Book_Date);
#            print(sql)
            cursor.execute(sql)
            
            #qid = cursor.lastrowid
            #print(qid)
            
            db.commit()
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
    def add_question(self,Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date):
        db = pymysql.connect(host="localhost",user="your_db_user",password="your_db_password",db="your_db_name",charset="utf8")
        try:
            cursor = db.cursor()
            # note: each "%s" placeholder in the SQL string must be wrapped in double quotes
            sql = ' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date) VALUES("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date);
            print(sql)
            cursor.execute(sql)
            print(cursor.lastrowid)
            db.commit()
        except Exception as err:
#        except:
#            print('Failed')
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
        
    def add_locoy(self,Bookname,Cater_Name,Book_author,Book_Introduction,Book_Palabras,Book_img,Booktitle,BookConte,abover): 
            reload(sys)                        # Python 2 only: required before sys.setdefaultencoding
            sys.setdefaultencoding("gbk")      # the Locoy interface below expects GBK-encoded form fields
            locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy (火车头) publish interface URL
            locoy_data = {
            'my_u':'your_username',   # CMS backend username
            'my_p':'your_password',   # CMS backend password
            'subject_669977_net':Bookname.encode('gbk', 'ignore'),
            'caid':Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net':Booktitle.encode('gbk', 'ignore'),
            'article':BookConte.encode('gbk', 'ignore'),
            'author':Book_author.encode('gbk', 'ignore'),
            'ready_1':Book_Palabras.encode('gbk', 'ignore'),
            'thumb':Book_img,
            'content':Book_Introduction.encode('gbk', 'ignore'),
            'abover':abover.encode('gbk', 'ignore')           
                }
            res = requests.post(locoy_url, data=locoy_data)
            print res.text
            print res.content
#            print Dsd
            return res
    
    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1
        self.total_num = 200   

    @every(minutes=8 * 60)
    def on_start(self):
        global Cater_Name
        Cater_Name = []
        while self.page_num <= self.total_num: 
            for self.CaterId in self.CaterIds:
                if self.CaterId == 'xuanhuan':
                    Cater_Name = '玄幻'
                if self.CaterId == 'wuxia':
                    Cater_Name = '武侠'
                if self.CaterId == 'lishi':
                    Cater_Name = '历史'
                if self.CaterId == 'yanqing':
                    Cater_Name = '都市'
                if self.CaterId == 'nvsheng':
                    Cater_Name = '都市'
                if self.CaterId == 'kehuan':
                    Cater_Name = '科幻'
                if self.CaterId == 'kongbu':
                    Cater_Name = '游戏'
                print self.CaterId
                url = self.base_url1 + str(self.CaterId) + self.base_url2 + str(self.page_num) + "/"          
                self.crawl(url, callback=self.list_Caterg,save=Cater_Name)
            self.page_num += 1 
            
    def list_Caterg(self, response):
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail,save=Cater_Name)
            
    def list_Caterg_detail(self, response):
        Cater_Name = response.save
#        print Cater_Name
        Bookname = response.doc('h1').text()
        print Bookname
        Book_author = response.doc('.authorname > a').text()
#        print Book_author
        Book_Introduction = response.doc('.book-intro > div').text()
#        print Book_Introduction
        Book_Synopsis = response.doc('b').eq(1).text()
#        print Book_Synopsis
        Book_Palabras = response.doc('.booktitle p').text().split(' ')[1].split('|')[0]
#        print Book_Palabras
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]   # novel ID
#        print BookIDs
        Book_Dates = str(datetime.datetime.now())         
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print img
            # download the novel cover image
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print Locaimg
            if(self.download(P_dir, imgDir, file_name, img)):   # these two lines can be commented out; they save the cover image to local disk
                print('attachment url is ' + img)
            Datos = {
                    "Cater_Name":Cater_Name,
                    "Book_author":Book_author,
                    "Book_Introduction":Book_Introduction,
                    "Book_Synopsis":Book_Synopsis,
                    "Book_Palabras":Book_Palabras,
                    "img":img,
                }
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)  # can be commented out; DB insert hook so other systems can publish from the database
        for each in response.doc('div[class="bookbtn-txt"]  a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page,save=Datos)
            
    @config(age=8 * 60 * 60)    
    def index_page(self, response): 
        Datos = {
                  "Cater_Name":response.save['Cater_Name'],
                   "Book_author":response.save['Book_author'],
                   "Book_Introduction":response.save['Book_Introduction'],
                   "Book_Synopsis":response.save['Book_Synopsis'],
                   "Book_Palabras":response.save['Book_Palabras'],
                   "img":response.save['img'],
                     }
        # crawl only the first chapter here; detail_page then follows the "next chapter" link,
        # so each novel is collected strictly in order from the first chapter to the last
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
#        for each in response.doc('.chapter-list  a[href^="http"]').items():   # alternative: queue every chapter from the catalogue at once
            self.crawl(each.attr.href, callback=self.detail_page,save=Datos)
    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):        
        # replacement rules used to scrub the source site's names/watermarks out of the text
        NewRe1 = u'哈书'
        NewRe2 = u'huhjsd.CC'
        NewRe3 = r'^\\n\\n'
        NewRe5 = u'小说网'
        NewRe6 = u'fgdfgf'
        NewRe7 = u'fgfgf'
        NewRe8 = u'ffhgf'
        NewRe4 = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
        ReC1 = u'静思'
        ReC2 = u'aghgf.com'
        ReC3 = u'aghgfh.com'
        ReC4 = u''
        ReC5 = u'文学网'
        ReC6 = r'<BR>'
        Bookname = response.doc('.readlocation a').eq(2).text()   # novel name
        print Bookname
        Cater_Name = response.save['Cater_Name']   # novel category
        Book_author = response.save['Book_author']   # novel author
        Book_Introduction1 = response.save['Book_Introduction']   # novel introduction
        Book_Synopsis = response.save['Book_Synopsis']   # latest update
        Book_Palabras = response.save['Book_Palabras']   # novel word count
        Bookurl = response.url   # novel URL
        Booktitle = response.doc('.article-title').text()   # chapter title
        BookID = response.doc('.readset-r span').text()   # novel ID
        BookConte1 = response.doc('.article-con').text()   # chapter content
        abover = response.doc('.article-title').text() + response.save['Book_Synopsis'] + response.save['Book_Palabras'] + response.save['Book_Introduction']   # novel status (ongoing or completed)
        Book_Date = str(datetime.datetime.now())    # crawl time
        BookConte2 = BookConte1.replace(NewRe1 , ReC1)
        BookConte3 = BookConte2.replace(NewRe2 , ReC2)
        BookConte5 = BookConte3.replace(NewRe5 , ReC5)
        BookConte6 = BookConte5.replace(NewRe6 , ReC2)
        BookConte7 = BookConte6.replace(NewRe7 , ReC2)
        BookConte8 = BookConte7.replace(NewRe3 , ReC6)
        BookConte4 = re.sub(NewRe4 , ReC4 , BookConte8)
        BookConte = BookConte4.replace("\n\n","<br>")
        print BookConte
        Book_Introduction2 = Book_Introduction1.replace(NewRe1 , ReC1)
        Book_Introduction3 = Book_Introduction2.replace(NewRe2 , ReC2)
        Book_Introduction4 = Book_Introduction3.replace(NewRe3 , ReC3)
        Book_Introduction = re.sub(NewRe4 , ReC4 , Book_Introduction4)
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        Book_img = response.save['img']   # novel cover image
             
        # insert the novel into MySQL
        self.add_question(Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date)   # can be commented out; DB insert hook so other systems can publish from the database
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)   # can be commented out; DB insert hook so other systems can publish from the database
        # publish via POST
        self.add_locoy(Bookname,Cater_Name,Book_author,Book_Introduction,Book_Palabras,Book_img,Booktitle,BookConte,abover)  # can be commented out; Locoy (火车头) publish hook, drop it if not needed
        Datos = {
                  "Cater_Name":response.save['Cater_Name'],
                   "Book_author":response.save['Book_author'],
                   "Book_Introduction":response.save['Book_Introduction'],
                   "Book_Synopsis":response.save['Book_Synopsis'],
                   "Book_Palabras":response.save['Book_Palabras'],
                   "img":response.save['img'],
                     }
        # follow the "next chapter" link so the novel keeps crawling in order
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page,save=Datos) 
        return {
            "Cater_Name":Cater_Name,
            "Bookname":Bookname,
            "Book_author":Book_author,
            "Book_Introduction":Book_Introduction,
            "Book_Synopsis":Book_Synopsis,
            "Book_Palabras":Book_Palabras,
            "Book_img":Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover":abover,
#            "Book_Date" = str(datetime.datetime.now()),
        }
    def download(self, P_dir, imgDir, file_name, Book_img):
        if not os.path.exists(imgDir): 
            os.makedirs(imgDir)
        file = imgDir + "/" + file_name
#        print file
        f = open(file, 'wb+')
        imag = requests.get(Book_img) 
        f.write(imag.content)
        f.close()
        return True
    # save an image fetched via self.crawl (callback variant)
    def save_imgs(self,response):
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        file_path = imgDir + file_name
        self.save_img(content,imgDir,file_path)
    # write image bytes to disk
    def save_img(self,content,imgDir,path):
        if not os.path.exists(imgDir):                         
            os.makedirs(imgDir)
        f = open(path,"wb" )
        f.write(content)
        f.close()
    # get the file extension from a URL
    def getExtension(self,url):                            
        extension = url.split(".")[-1]
        return extension 
    
    # get the image name from a URL
    def getname(self,url):
        name=url.split("/")[-1].split(".")[0]
        return name
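The INSERT statements above build the SQL by Python string formatting, which is why the comments insist on wrapping every "%s" in double quotes; chapter text that contains quotes or backslashes can still break the statement. If that becomes a problem, a safer variant is to pass the values to pymysql's parameterized execute() and let the driver do the escaping. The sketch below is not part of the original script; it shows the add_comment() insert rewritten that way, with placeholder connection details you would swap for your own.

# Sketch only: the add_comment() insert rewritten as a parameterized query,
# so pymysql escapes the values instead of formatting them into the SQL string.
import pymysql

def add_comment_safe(Bookname, Booktitle, BookID, Titleid, Book_Date):
    db = pymysql.connect(host="localhost", user="your_db_user", password="your_db_password",
                         db="your_db_name", charset="utf8")
    sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
           'values (%s, %s, %s, %s, %s)')
    try:
        cursor = db.cursor()
        cursor.execute(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))
        db.commit()
    except Exception as err:
        print("Error %s for execute sql: %s" % (err, sql))
        db.rollback()
    finally:
        db.close()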