Novel Website Crawler

Python / Published 2020-07-21 / Updated 2023-03-16

1. A simple novel-crawler template

import requests
from bs4 import BeautifulSoup
import re
import os

# This script is not general-purpose; analyse the target site and adapt the selectors.
root = ''  # URL of the novel's index page
art_id = ''  # article link id (the path that chapter hrefs start with)

os.makedirs('./txts', exist_ok=True)  # make sure the output directory exists
fname = './txts/'+art_id[5:-1]+'.txt'
encode = 'gbk'


def GetAllSection(root, id):
    """Collect the chapter URLs listed on the index page."""
    r = requests.get(root + id)
    r.encoding = encode
    bs = BeautifulSoup(r.text, 'html.parser')
    url_a = bs.find_all('a')
    urls = []
    for u in url_a:
        href = u.get('href')
        # keep only links whose href starts with the article id
        if href and re.match(id, href):
            urls.append(root+href)

    del urls[0]  # the first matched link is not a chapter page, drop it
    return urls


def SaveArticle(root, id, fname):
    """Download every chapter and append it to one text file."""
    first = True
    urls = GetAllSection(root, id)
    for url in urls:
        r = requests.get(url)
        r.encoding = encode
        bs = BeautifulSoup(r.text, 'html.parser')
        # on this site the chapter body sits in the second <p> tag
        text = bs.find_all('p')[1].text
        text = text.replace('\xa0', ' ')
        text = text.replace('\ufffd', ' ')
        text = text.replace('\n\r\n', '\n')
        with open(fname, 'a', encoding=encode) as file:
            if first:
                # write the book title (from <h1>) once, before the first chapter
                file.write(bs.h1.text+'\n')
                first = False
            file.write(text+'\n')


if __name__ == "__main__":
    SaveArticle(root, art_id, fname)
    # print(fname)
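
As a usage sketch: the values below for root and art_id are purely hypothetical (the template deliberately leaves them blank), but they show how the pieces fit together and how to preview the chapter list before running a full crawl.

root = 'https://www.example-novels.com'   # hypothetical site root
art_id = '/book/12345/'                   # hypothetical article path; chapter hrefs start with it

# Preview the chapter URLs first; only call SaveArticle once the list looks right.
chapters = GetAllSection(root, art_id)
print(len(chapters), 'chapters found')
for u in chapters[:5]:
    print(u)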

2. A Tao Te Ching (道德经) crawler

I wanted to build an Android app for reading classical Chinese texts, so the source material had to be scraped from the web myself. Once the site's URL scheme and the layout of the article content on each page have been worked out, the script is easy to write.
The output file is saved in the current directory.

import requests
from bs4 import BeautifulSoup
encode = 'utf-8'
cha_num = 81  # the Tao Te Ching has 81 chapters, one page per chapter

def SaveArticle(fname, urls):
    for url in urls:
        print('-', end='')  # simple progress indicator, one dash per page
        r = requests.get(url)
        r.encoding = encode
        bs = BeautifulSoup(r.text, 'html.parser')
        # the chapter text sits in <div class="grap"> blocks on this site
        text = bs.find_all('div', {"class": "grap"})
        with open(fname, 'a', encoding=encode) as file:
            for t in text:
                # normalise non-breaking/full-width spaces and replacement characters
                te = t.text.replace('\xa0', ' ').replace(
                    '\u3000', ' ').replace('\ufffd', ' ')
                file.write(te+'\n')

if __name__ == "__main__":
    # one page per chapter: 1.html ... 81.html
    urls = ['https://www.5000yan.com/'+str(i)+'.html' for i in range(1, cha_num + 1)]
    SaveArticle('daodejing.txt', urls)
    # print(urls)
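
Both scripts fire requests back-to-back and assume every fetch succeeds. As a minimal hardening sketch (not part of the original scripts), a small wrapper adds a delay between requests and a couple of retries; the requests.get calls inside either SaveArticle could be swapped for polite_get(url, encode).

import time
import requests

def polite_get(url, encoding, retries=3, delay=1.0):
    """Fetch a page politely: fixed delay between requests, a few retries on failure."""
    for attempt in range(retries):
        try:
            r = requests.get(url, timeout=10)
            r.encoding = encoding
            r.raise_for_status()
            time.sleep(delay)  # pause so we do not hammer the server
            return r.text
        except requests.RequestException:
            time.sleep(delay * (attempt + 1))  # back off a little before retrying
    return None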