A Scraper for a Certain Novel Website

Python / Published 2022-01-16 / Updated 2023-03-16

Strips the ads and scrapes the entire novel directly.

If you don't have a proxy, just set proxies to an empty dict:

proxies = {}
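
As a side note, requests can also pick up a proxy from the standard HTTP_PROXY/HTTPS_PROXY environment variables, or from a Session, so you don't have to pass proxies= on every call. A minimal sketch, assuming the same local proxy on port 10809 (example.com is just a placeholder):

import os
import requests

# Option 1: requests honors the standard proxy environment variables by default.
os.environ['HTTP_PROXY'] = 'http://127.0.0.1:10809'
os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:10809'
requests.get('http://example.com')  # proxied automatically

# Option 2: a Session carries the proxies (and cookies) across all requests.
session = requests.Session()
session.proxies = {
    'http': 'http://127.0.0.1:10809',
    'https': 'http://127.0.0.1:10809',
}
session.get('http://example.com')  # same proxy, no per-call proxies= needed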
import os

import requests
from lxml import etree

proxies = {
    'http': 'http://127.0.0.1:10809',
    'https': 'http://127.0.0.1:10809',  # HTTPS requests also go through the local HTTP proxy
}


class Spider:
    def __init__(self, _host):
        self.host = _host

    def search(self, kw):
        # Query the site's WAP search endpoint; return the result links and titles.
        res = requests.get(f'{self.host}/wap.php?action=search&wd={kw}', proxies=proxies).content
        html = etree.HTML(res)
        search_res = html.xpath('//a[@class="name"]/@href')
        name = html.xpath('//a[@class="name"]/text()')
        return search_res, name

    def get_all_chapter(self, url):
        # The chapter list is paginated; the page URLs sit in a <select name="pagelist">.
        res = requests.get(self.host + url, proxies=proxies).content
        html = etree.HTML(res)
        chapter_list = html.xpath('//select[@name="pagelist"]/option/@value')

        # Skip the first three links; on this site's layout they are not chapters.
        chapters = html.xpath('//div[@class="mod block update chapter-list"]//li/a/@href')[3:]

        # The first page was parsed above, so continue from the second one.
        for page in chapter_list[1:]:
            res = requests.get(self.host + page, proxies=proxies).content
            html = etree.HTML(res)
            chapters += html.xpath('//div[@class="mod block update chapter-list"]//li/a/@href')
        return chapters

    def get_text(self, chapters):
        text = ''
        for ch in chapters:
            res = requests.get(self.host + ch, proxies=proxies).content
            html = etree.HTML(res)
            te = html.xpath('//div[@id="nr1"]/text()')  # the chapter body only, ads excluded
            text += ''.join(te) + '\r\n\n'
            # Long chapters are split across sub-pages linked at the bottom of the page.
            subch = html.xpath('//center[@class="chapterPages"]/a/@href')
            print(self.host + ch)
            for ich in subch:
                print(self.host + ich)
                res = requests.get(self.host + ich, proxies=proxies).content
                html = etree.HTML(res)
                te = html.xpath('//div[@id="nr1"]/text()')
                text += ''.join(te) + '\r\n\n'
        return text

    def download(self, kw):
        urls, names = self.search(kw)
        for url, name in zip(urls, names):
            print(self.host + url, name)
            chs = self.get_all_chapter(url)
            text = self.get_text(chs)

            os.makedirs('downloads', exist_ok=True)

            # Don't overwrite an existing file; append '1' until the name is free.
            while os.path.exists(f'downloads/{name}.txt'):
                name = name + '1'

            with open(f'downloads/{name}.txt', 'w', encoding='utf-8') as f:
                f.write(text)

if __name__ == "__main__":
    sp = Spider('')   # site URL
    sp.download('')   # novel title to search for
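
The script fires its requests back to back with no error handling, so one failed page loses the whole run. A minimal hardening sketch (the fetch_html helper and its parameters are my own names, not part of the original script): each requests.get(...).content plus etree.HTML(...) pair above could be routed through something like this, which adds a timeout, a small retry loop, and a politeness delay:

import time
import requests
from lxml import etree

def fetch_html(url, proxies, retries=3, delay=1.0):
    # Fetch a URL and return the parsed lxml tree, retrying transient failures.
    for attempt in range(retries):
        try:
            res = requests.get(url, proxies=proxies, timeout=10)
            res.raise_for_status()  # turn HTTP 4xx/5xx into exceptions
            return etree.HTML(res.content)
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # out of retries, let the caller see the error
            time.sleep(delay * (attempt + 1))  # back off a little more each time

For example, the first fetch in get_text's loop would become html = fetch_html(self.host + ch, proxies).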