Original version: https://b.mortal.live/archives/txtspider
- Added a title for each chapter
- Added a multithreaded crawler based on a thread pool
- Fixed the mixed text-and-image problem found on some sites by converting the images back into text (a small standalone sketch of the idea follows this list)
- Cleaned up parts of the code
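Below is a minimal standalone sketch of the image-to-text trick used by repword. The mapping string and the sample HTML fragment are made-up values for illustration; the assumption (taken from the code itself) is that the class index in n_X is a 1-based position into a fixed replacement string.

import re

# Hypothetical values for illustration only.
txts = u'的一是了我'                      # characters the images stand for
sample = u'他说<em class=n_2></em>句话'   # page fragment containing one image tag
fixed = re.sub(r'<em class=n_(\d+)></em>', lambda m: txts[int(m.group(1)) - 1], sample)
print(fixed)  # -> 他说一句话

The full crawler follows.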
import requests
import re
import os
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
# Local HTTP proxy used for all requests; drop the proxies= arguments below if you do not need one.
proxies = {
    'http': 'http://127.0.0.1:10809',
    'https': 'http://127.0.0.1:10809'  # https traffic also goes through the http proxy
}
class Spider:
    def __init__(self, _host):
        self.host = _host

    # Characters that the site renders as <em class=n_X></em> images; the original
    # post only leaves this placeholder string -- fill in the real character sequence.
    txts = u'需要替换的图片的文字'

    def repword(self, res):
        # Replace every <em class=n_X></em> image tag with the X-th character of txts.
        res = res.decode('utf-8')
        res = re.sub(r'<em class=n_(\d+)></em>', lambda obj: self.txts[int(obj.group(1)) - 1], res)
        return res.encode('utf-8')
    def search(self, kw):
        # Query the site's mobile search page and return the result links and book names.
        res = requests.get(f'{self.host}/wap.php?action=search&wd={kw}', proxies=proxies).content
        html = etree.HTML(res)
        search_res = html.xpath('//a[@class="name"]/@href')
        name = html.xpath('//a[@class="name"]/text()')
        return search_res, name
    def get_all_chapter(self, url):
        # Collect chapter links from every page of the paginated chapter list.
        res = requests.get(self.host + url, proxies=proxies).content
        html = etree.HTML(res)
        chapter_list = html.xpath('//select[@name="pagelist"]/option/@value')
        # The slice skips the first three links, which are not chapter entries on the target layout.
        chapters = html.xpath('//div[@class="mod block update chapter-list"]//li/a/@href')[3:]
        for l in chapter_list[1:]:  # the first option is the page already fetched above
            res = requests.get(self.host + l, proxies=proxies).content
            html = etree.HTML(res)
            chas = html.xpath('//div[@class="mod block update chapter-list"]//li/a/@href')
            chapters += chas
        return chapters
    def get_chapter_text(self, ch):
        # Download a single chapter (including its sub-pages) and return title + body text.
        res = requests.get(self.host + ch, proxies=proxies).content
        res = self.repword(res)
        html = etree.HTML(res)
        title = html.xpath('//h1[@class="page-title"]//text()')
        te = html.xpath('//div[@id="nr1"]//text()')
        title = ''.join(title)
        text = ''.join(te) + '\r\n\n'
        # Long chapters are split across several sub-pages listed at the bottom of the page.
        subch = html.xpath('//center[@class="chapterPages"]/a/@href')
        print(self.host + ch)
        for ich in subch:
            print(self.host + ich)
            res = requests.get(self.host + ich, proxies=proxies).content
            html = etree.HTML(res)
            te = html.xpath('//div[@id="nr1"]//text()')
            text += ''.join(te) + '\r\n\n'
        # Prepend the chapter title unless the body already contains it.
        return text if title in text else title + text
    def get_text_mult(self, chapters):
        # Fetch chapters concurrently; pool.map yields results in input order,
        # so the joined text stays in chapter order.
        with ThreadPoolExecutor(max_workers=8) as pool:
            results = pool.map(self.get_chapter_text, chapters)
            return ''.join(results)

    def get_text(self, chapters):
        # Single-threaded fallback kept for reference.
        text = ''
        for ch in chapters:
            text += self.get_chapter_text(ch)
        return text
    def download(self, kw):
        # Search for the keyword and save every matching book to downloads/<name>.txt.
        urls, names = self.search(kw)
        for url, name in zip(urls, names):
            print(self.host + url, name)
            chs = self.get_all_chapter(url)
            # text = self.get_text(chs)   # single-threaded version
            text = self.get_text_mult(chs)
            if not os.path.exists('downloads'):
                os.mkdir('downloads')
            # Avoid overwriting an existing file with the same name.
            while os.path.exists(f'downloads/{name}.txt'):
                name = name + '1'
            with open(f'downloads/{name}.txt', 'w', encoding="utf-8") as f:
                f.write(text)
if __name__ == "__main__":
    sp = Spider('')      # site host left blank in the original post
    sp.download('')      # search keyword left blank in the original post
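To run the script, fill in the two placeholders above with the target site's base URL and a search keyword, for example (hypothetical values) Spider('https://www.example.com') and sp.download('诛仙'); each matching book is then written to downloads/<book name>.txt.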