# -*- coding: UTF-8 -*-
# !/usr/bin/env python3
# Author: Murphy
import os
import re
import requests
from time import sleep
from bs4 import BeautifulSoup
class CollectNovel(object):
def __init__(self):
self.novel_data = {}
self.start_url = "https://www.ddxsku.com/full.html"
self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"}
def collect_url(self):
print("Collecting novel basic data.....")
start_response = requests.get(self.start_url, headers=self.headers)
total_page = re.search(r'<em id="pagestats">1/(.+)</em>', start_response.text).group(1)
novel_navigation_urls = [fr"http://www.ddxsku.com/modules/article/articlelist.php?fullflag=1&page={i}" for i in range(1, int(total_page)+1)]
for novel_navigation_url in novel_navigation_urls:
novel_navigation_response = requests.get(novel_navigation_url, headers=self.headers)
novel_index_urls = re.findall(r'<td class="L"><a href="(.+)" title=".+" target="_blank">.+</a></td>', novel_navigation_response.text)
for novel_index_url in novel_index_urls:
novel_index_response = requests.get(novel_index_url, headers=self.headers)
novel_index_response.encoding = "utf-8"
novel_name = re.search(fr'.+<a href="http://www.ddxsku.com/xiaoshuo/\d+\.html">(.+)</a>.+', novel_index_response.text).group(1)
novel_author = re.search(r'<dd><h3>作者:(.+)</h3><br>.+</h3></dd>', novel_index_response.text).group(1)
self.novel_data = {novel_name: [("novel_author", novel_author)]}
print("Collecting novel: 《%s》--%s" % (novel_name, novel_author))
index_soup = BeautifulSoup(novel_index_response.text, "html.parser")
novel_text_urls = index_soup.find_all("td", class_="L")
for each in novel_text_urls:
chapters_title = each.text
chapters_url = each.a["href"]
self.novel_data[novel_name].append((chapters_title, chapters_url))
sleep(1)
break # 只收集一页小说导航中的一个小说信息
break # 只收集一页小说导航中的小说信息
def novel_copy(self):
self.collect_url()
if self.novel_data:
for name in self.novel_data:
count = 0
print("Downloading: 《%s》" % name, end="\n"*2)
work_path = r"C:/Users/Administrator/Desktop/NovelCopy/%s-%s" % (name, self.novel_data[name][0][1])
if not os.path.exists(work_path):
os.makedirs(work_path)
os.chdir(work_path)
else:
os.chdir(work_path)
for chapter_data in self.novel_data[name][1:]:
count += 1
print("Downloading: 《%s》--%s" % (name, chapter_data[0]))
chapter_response = requests.get(chapter_data[1], headers=self.headers)
chapter_response.encoding = "utf-8"
chapter_soup = BeautifulSoup(chapter_response.text, "html.parser")
chapter_text = chapter_soup.find("dd", id="contents")
with open("%d-%s.txt" % (count, chapter_data[0]), "w", encoding="utf-8") as f:
f.write(chapter_text.text)
sleep(2)
print()
else:
print("Collect data failed")
if __name__ == "__main__":
novel = CollectNovel()
novel.novel_copy()
Last modification:April 2, 2020
© Allow specification reprint
6 comments
哈哈哈,写的太好了https://www.cscnn.com/
看的我热血沸腾啊www.jiwenlaw.com
看的我热血沸腾啊https://www.237fa.com/
想想你的文章写的特别好https://www.jiwenlaw.com/
博主真是太厉害了!!!
要不要互加友链?