Scrape an entire novel in one go, with the ads stripped out.
If you are not using a proxy, set proxies to an empty dict:
proxies = {}
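A quick way to confirm that whichever configuration you pick actually reaches the network is to request a simple echo endpoint with the same proxies mapping the spider uses below (httpbin.org is just a convenient public test service, not part of the target site):

import requests

proxies = {}  # or {'http': 'http://127.0.0.1:10809', 'https': 'http://127.0.0.1:10809'}
# If this prints an IP address (yours, or the proxy's), requests is routing traffic as expected.
print(requests.get('https://httpbin.org/ip', proxies=proxies, timeout=10).json())

The full script follows.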
import requests
from bs4 import BeautifulSoup
import re
import os
import time
from lxml import etree

# Local proxy; replace the port with your own, or use proxies = {} for a direct connection
proxies = {
    'http': 'http://127.0.0.1:10809',
    'https': 'http://127.0.0.1:10809'  # HTTPS traffic is also routed through the HTTP proxy endpoint
}
class Spider:
    def __init__(self, _host):
        self.host = _host

    def search(self, kw):
        # Query the site's search page and return the result links and novel titles
        res = requests.get(f'{self.host}/wap.php?action=search&wd={kw}', proxies=proxies).content
        html = etree.HTML(res)
        search_res = html.xpath('//a[@class="name"]/@href')
        name = html.xpath('//a[@class="name"]/text()')
        return search_res, name
    def get_all_chapter(self, url):
        # The chapter list is paginated via a <select name="pagelist"> element:
        # collect the chapter links from the first page, then from every remaining page
        res = requests.get(self.host + url, proxies=proxies).content
        html = etree.HTML(res)
        chapter_list = html.xpath('//select[@name="pagelist"]/option/@value')
        # the first three links are skipped (they are not sequential chapter entries on this layout)
        chapters = html.xpath('//div[@class="mod block update chapter-list"]//li/a/@href')[3:]
        for l in chapter_list[1:]:
            res = requests.get(self.host + l, proxies=proxies).content
            html = etree.HTML(res)
            chas = html.xpath('//div[@class="mod block update chapter-list"]//li/a/@href')
            chapters += chas
        return chapters
    def get_text(self, chapters):
        text = ''
        for ch in chapters:
            # The ad-free chapter body lives in <div id="nr1">
            res = requests.get(self.host + ch, proxies=proxies).content
            html = etree.HTML(res)
            te = html.xpath('//div[@id="nr1"]/text()')
            text += ''.join(te) + '\r\n\n'
            # Long chapters are split across sub-pages linked from <center class="chapterPages">
            subch = html.xpath('//center[@class="chapterPages"]/a/@href')
            print(self.host + ch)
            for ich in subch:
                print(self.host + ich)
                res = requests.get(self.host + ich, proxies=proxies).content
                html = etree.HTML(res)
                te = html.xpath('//div[@id="nr1"]/text()')
                text += ''.join(te) + '\r\n\n'
        return text
    def download(self, kw):
        # Search for the keyword and save every matching novel as a .txt file
        urls, names = self.search(kw)
        for i in range(len(urls)):
            url = urls[i]
            name = names[i]
            print(self.host + url, name)
            chs = self.get_all_chapter(url)
            text = self.get_text(chs)
            if not os.path.exists('downloads'):
                os.mkdir('downloads')
            # Append '1' until the filename no longer clashes with an existing file
            while os.path.exists(f'downloads/{name}.txt'):
                name = name + '1'
            with open(f'downloads/{name}.txt', 'w', encoding="utf-8") as f:
                f.write(text)
if __name__ == "__main__":
    sp = Spider('')   # site URL
    sp.download('')   # novel title
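Note that time is imported but never used. If the site starts rejecting rapid-fire requests, a small pause between chapter downloads is the usual fix; below is a minimal sketch (polite_get is a hypothetical helper, and the one-second default delay is an arbitrary choice):

import time
import requests

def polite_get(url, proxies, delay=1.0):
    # Fetch a page, then pause so consecutive requests do not hammer the site
    res = requests.get(url, proxies=proxies)
    time.sleep(delay)  # arbitrary pause; tune to the site's tolerance
    return res.content

Each requests.get(...).content call inside get_all_chapter and get_text could be routed through a helper like this, at the cost of a longer total download time.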