#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
# Author:  Murphy

import os
import re
import requests
from time import sleep
from bs4 import BeautifulSoup
from multiprocessing import Pool
from multiprocessing import Manager
from multiprocessing import Process


class NovelCopy(object):
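    """Scrape completed novels from www.ddxsku.com and save each one as a
    plain-text file, using one collector process and a pool of downloaders."""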
    def __init__(self):
        self.start_url = "https://www.ddxsku.com/full.html"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"}

    def collect_novel(self, novel_data_queue):
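        """Walk every page of the "completed novels" listing, scrape each
        novel's name, author and chapter list, and put the result on
        novel_data_queue as [number, name, author, [(title, url), ...]].
        """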
        novel_number = 0
        start_response = requests.get(self.start_url, headers=self.headers)
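        # The pagestats element reads "1/<total>", giving the number of listing pages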
        total_page = re.search(r'<em id="pagestats">1/(.+)</em>', start_response.text).group(1)
        novel_navigation_urls = [fr"http://www.ddxsku.com/modules/article/articlelist.php?fullflag=1&page={i}" for i in
                                 range(1, int(total_page) + 1)]

        for novel_navigation_url in novel_navigation_urls:
            novel_navigation_response = requests.get(novel_navigation_url, headers=self.headers)

            novel_index_urls = re.findall(r'<td class="L"><a href="(.+)" title=".+" target="_blank">.+</a></td>', novel_navigation_response.text)
            for novel_index_url in novel_index_urls:
                novel_number += 1
                novel_index_response = requests.get(novel_index_url, headers=self.headers)
                novel_index_response.encoding = "utf-8"

                novel_name = re.search(fr'.+<a href="http://www.ddxsku.com/xiaoshuo/\d+\.html">(.+)</a>.+',
                                       novel_index_response.text).group(1)
                novel_author = re.search(r'<dd><h3>作者:(.+)</h3><br>.+</h3></dd>', novel_index_response.text).group(1)
                novel_data = [novel_number, novel_name, novel_author, []]

                index_response_soup = BeautifulSoup(novel_index_response.text, "html.parser")
                novel_text_urls = index_response_soup.find_all("td", class_="L")
                for each in novel_text_urls:
                    if each.a is None:
                        continue  # skip filler cells that carry no chapter link
                    chapters_title = each.text
                    chapters_url = each.a["href"]
                    novel_data[3].append((chapters_title, chapters_url))

                novel_data_queue.put(novel_data)

    @staticmethod
    def change_path():
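        """Create a ./novel directory under the current working directory if
        needed and switch into it."""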
        novel_save_path = os.path.join(os.getcwd(), "novel")

        # exist_ok avoids a race when several downloader processes start at once
        os.makedirs(novel_save_path, exist_ok=True)
        os.chdir(novel_save_path)

    def download_novel(self, novel_data_queue):
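        """Worker loop: take one novel's data from the queue, fetch every
        chapter and append it to <number>-<name>(<author>).txt, until a None
        sentinel arrives."""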
        print(fr"Start crawler:  {os.getpid()}")
        self.change_path()

        while True:
            novel_data = novel_data_queue.get()
            if novel_data is None:  # sentinel from the main process: no more novels
                break

            novel_number = novel_data[0]
            novel_name = novel_data[1]
            novel_author = novel_data[2]

            print(fr"Start download:  {novel_name}")

            novel_file_name = fr"{novel_number}-{novel_name}({novel_author}).txt"

            for chapter_data in novel_data[3]:
                chapter_title = chapter_data[0]
                chapter_url = chapter_data[1]

                chapter_response = requests.get(chapter_url, headers=self.headers)
                chapter_response.encoding = "utf-8"
                chapter_soup = BeautifulSoup(chapter_response.text, "html.parser")
                chapter_content_raw = chapter_soup.find("dd", id="contents")
                chapter_content = "\n\n" + chapter_title.center(100) + "\n" + chapter_content_raw.text  # prepend the chapter title to the chapter body

                with open(novel_file_name, "a+", encoding="utf-8") as f:
                    f.write(chapter_content)

                print(fr'Downloaded:  {novel_number}-{chapter_title}({novel_name})')
                sleep(0.4)
            print()

    def crawler(self):
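        """Run one collector process that scrapes novel metadata and a pool of
        three downloader processes that write the chapters to disk.

        A Manager().Queue (maxsize 3) carries the scraped data between them;
        unlike a plain multiprocessing.Queue, the managed proxy can be passed
        to Pool tasks.
        """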
        print(fr"Main process starts to work--{os.getpid()}")

        novel_data_queue = Manager().Queue(3)
        collect_crawler = Process(target=self.collect_novel, args=(novel_data_queue,))
        collect_crawler.start()

        pool = Pool(3)
        for _ in range(3):
            pool.apply_async(self.download_novel, args=(novel_data_queue,))

        collect_crawler.join()
        # The collector is finished: send one sentinel per downloader so each
        # worker drains the queue and exits instead of being killed mid-write.
        for _ in range(3):
            novel_data_queue.put(None)
        pool.close()
        pool.join()
        print(fr"End of main process works--{os.getpid()}")


if __name__ == "__main__":
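    # Novels are written to ./novel/<number>-<name>(<author>).txt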
    copy = NovelCopy()
    copy.crawler()