Multithreaded Novel Website Scraper

Python / Published 2022-03-06 / Updated 2023-03-16

Original version: https://b.mortal.live/archives/txtspider

  • Added each chapter's title to the output
  • Added a multithreaded crawler built on a thread pool
  • Fixed the mixed text-and-image problem on some sites by converting the image glyphs back to text (see the sketch after this list)
  • Cleaned up parts of the code
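
The third item deserves a word of explanation: the site replaces certain characters with empty <em class=n_N></em> tags, presumably styled as character images via CSS, where N is a 1-based index into a fixed glyph table. Undoing this is a single regex substitution. A minimal sketch, with a made-up glyph table and sample HTML (the real table must be transcribed from the target site):

import re

# Hypothetical glyph table: the Nth character stands in for <em class=n_N></em>.
GLYPHS = u'的一是了我'

def restore_glyphs(html):
    # Each placeholder carries a 1-based index into the glyph table.
    return re.sub(r'<em class=n_(\d+)></em>',
                  lambda m: GLYPHS[int(m.group(1)) - 1], html)

sample = '<div id="nr1">这<em class=n_1></em>个<em class=n_3></em>例子</div>'
print(restore_glyphs(sample))  # -> <div id="nr1">这的个是例子</div>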

import requests
import re
import os
from lxml import etree
from concurrent.futures import ThreadPoolExecutor

# Route requests through a local HTTP proxy; note that the 'https' entry
# also uses the http:// scheme, which is what requests expects.
proxies = {
    'http': 'http://127.0.0.1:10809',
    'https': 'http://127.0.0.1:10809',
}


class Spider:
    # Placeholder: fill in, in order, the characters the site renders as image
    # glyphs; the Nth character here replaces <em class=n_N></em>.
    txts = u'需要替换的图片的文字'

    def __init__(self, _host):
        self.host = _host

    def repword(self, res):
        # Swap every image-glyph placeholder back to its real character.
        res = res.decode('utf-8')
        res = re.sub(r'<em class=n_(\d+)></em>',
                     lambda obj: self.txts[int(obj.group(1)) - 1], res)
        return res.encode('utf-8')


    def search(self, kw):
        # Query the site's search page; return matching detail-page URLs and titles.
        res = requests.get(f'{self.host}/wap.php?action=search&wd={kw}', proxies=proxies).content
        html = etree.HTML(res)
        search_res = html.xpath('//a[@class="name"]/@href')
        names = html.xpath('//a[@class="name"]/text()')
        return search_res, names

    def get_all_chapter(self, url):
        # Collect chapter links from every page of the paginated chapter list.
        res = requests.get(self.host + url, proxies=proxies).content
        html = etree.HTML(res)
        # <select name="pagelist"> holds one <option> per chapter-list page.
        chapter_list = html.xpath('//select[@name="pagelist"]/option/@value')

        # Skip the first three links, presumably "latest chapter" shortcuts at the top.
        chapters = html.xpath('//div[@class="mod block update chapter-list"]//li/a/@href')[3:]

        for l in chapter_list[1:]:
            res = requests.get(self.host + l, proxies=proxies).content
            html = etree.HTML(res)
            chas = html.xpath('//div[@class="mod block update chapter-list"]//li/a/@href')
            chapters += chas
        return chapters

    def get_chapter_text(self, ch):
        # Download one chapter, following its sub-pages, and return title + text.
        res = requests.get(self.host + ch, proxies=proxies).content
        res = self.repword(res)
        html = etree.HTML(res)
        title = html.xpath('//h1[@class="page-title"]//text()')
        te = html.xpath('//div[@id="nr1"]//text()')

        title = ''.join(title)
        text = ''.join(te) + '\r\n\n'
        # Long chapters are split into sub-pages listed under chapterPages.
        subch = html.xpath('//center[@class="chapterPages"]/a/@href')
        print(self.host + ch)
        for ich in subch:
            print(self.host + ich)
            res = requests.get(self.host + ich, proxies=proxies).content
            html = etree.HTML(res)
            te = html.xpath('//div[@id="nr1"]//text()')
            text += ''.join(te) + '\r\n\n'
        # Prepend the title unless the body already contains it.
        return text if title in text else title + text

    def get_text_mult(self, chapters):
        # Fetch chapters concurrently; pool.map yields results in input order,
        # so joining them keeps the chapters in sequence.
        with ThreadPoolExecutor(max_workers=8) as pool:
            results = pool.map(self.get_chapter_text, chapters)
            return ''.join(results)

    def get_text(self, chapters):
        # Single-threaded fallback.
        text = ''
        for ch in chapters:
            text += self.get_chapter_text(ch)
        return text

    def download(self, kw):
        urls, names = self.search(kw)
        for url, name in zip(urls, names):
            print(self.host + url, name)
            chs = self.get_all_chapter(url)
            # text = self.get_text(chs)  # single-threaded fallback
            text = self.get_text_mult(chs)
            os.makedirs('downloads', exist_ok=True)

            # Avoid overwriting an existing file by appending '1' to the name.
            while os.path.exists(f'downloads/{name}.txt'):
                name = name + '1'

            with open(f'downloads/{name}.txt', 'w', encoding='utf-8') as f:
                f.write(text)

if __name__ == "__main__":
    # Placeholders: fill in the target site's base URL and a search keyword.
    sp = Spider('')
    sp.download('')
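
To run the script, fill in the two placeholders, for example (both values below are made up):

sp = Spider('https://example.com')  # base URL of the novel site
sp.download('keyword')              # book title or keyword to search for

Each matching book is written to downloads/<title>.txt. One note on the thread pool: ThreadPoolExecutor.map returns results in input order rather than completion order, which is why get_text_mult can simply join them and still keep the chapters in sequence; max_workers=8 is a compromise between speed and the load put on the site.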