diff --git a/README.md b/README.md
index acb1367..ac96858 100644
--- a/README.md
+++ b/README.md
@@ -16,11 +16,12 @@
 |Option |Description |Required|
 |:----------------|:--------------------------------------------------------------------------------------------------|:------|
 |`-h`, `--help` |Show help |❌ |
-|`-i`, `--id` |ID (or page URL) of the document to download |✔ |
-|`-o`, `--output` |Output file name; defaults to `book118.pdf` |❌ |
+|`-u`, `--url` |Page URL of the document to download |✔ |
+|`-o`, `--output` |Output file name; defaults to the document title plus `.pdf` |❌ |
 |`-p`, `--proxy` |Proxy address to use (defaults to the values of the `HTTP_PROXY` and `HTTPS_PROXY` environment variables); pass `-p ''` to force a direct connection |❌ |
 |`-f`, `--force` |Force re-download; do not use the cache |❌ |
-|`-t`, `--thread` |Number of threads to use |❌ |
+|`-t`, `--thread` |Number of threads to use; defaults to 10 if not specified |❌ |
+|`-s`, `--safe` |Enable this if the server rejects your requests: it forces a single thread and lengthens the interval between requests and downloads |❌ |
 
 ## Using as a module
 
@@ -31,7 +32,7 @@ python3 -m pip install documentDownloader
 
 Once installed, the `documentDownloader` command can be used directly.
 
-e.g. `documentDownloader -i https://max.book118.com/html/2020/0109/5301014320002213.shtm -o '单身人群专题研究报告-2019.pdf' -p http://127.0.0.1:1080 -f -t 20`
+e.g. `documentDownloader -u https://max.book118.com/html/2020/0109/5301014320002213.shtm -o '单身人群专题研究报告-2019' -p http://127.0.0.1:1080 -f -t 20`
 
 ### Using main.py from the source directly
 
@@ -41,7 +42,7 @@
 2. Install the dependencies (Pillow, reportlab, requests):
 `python -m pip install -r requirements.txt`
 3. Run it with `python3 main.py`
-e.g. `python main.py -i https://max.book118.com/html/2020/0109/5301014320002213.shtm -o '单身人群专题研究报告-2019.pdf' -p http://127.0.0.1:1080 -f -t 20`
+e.g. `python main.py -u https://max.book118.com/html/2020/0109/5301014320002213.shtm -o '单身人群专题研究报告-2019' -p http://127.0.0.1:1080 -f -t 20`
 
 **For learning about web scraping and related topics only; please support legitimately published books** *though many of the PDFs on book118 are themselves pirated*
 
@@ -55,4 +56,5 @@
 
 - 2019-01-29: Book118 site updated; adjusted the affected code. [@JodeZer](https://github.com/JodeZer)
 - 2020-01-09: Refactored the code; added multi-threaded download, proxy support, building the PDF directly from an existing cache, and automatic image-size detection when generating the PDF. [@OhYee](https://github.com/OhYee)
-- 2020-05-25: Published to PyPI
\ No newline at end of file
+- 2020-05-25: Published to PyPI
+- 2021-10-18: Book118 site updated; adjusted the affected code. The default output file name is now the document title; a warning is printed for documents whose full text cannot be previewed for free; the request interval is set to 2 seconds (in testing, shorter intervals are likely to return empty URLs); added a "safe" (slow download) option to avoid being rejected by the server for downloading too fast. [@alxt17](https://github.com/alxt17)
\ No newline at end of file
diff --git a/book118/__init__.py b/book118/__init__.py
index 09f0b4a..c33d9c9 100644
--- a/book118/__init__.py
+++ b/book118/__init__.py
@@ -7,25 +7,27 @@
 def solve_argv(argv):
-    help_text = '''python main.py -i
+    help_text = '''python main.py -u
         -h --help       show help
-        -i --id         document id (required)
-        -o --output     output file (default `book118.pdf`)
+        -u --url        document url (required)
+        -o --output     output file (default: the document title)
         -p --proxy      proxy url (default using `http_proxy` and `https_proxy`)
         -f --force      force re-download images
         -t --thread     thread number for downloading images
+        -s --safe       limit download requests in case the server refuses them
     '''
-    document_id = None
+    document_url = None
     http_proxy = os.environ.get("HTTP_PROXY")
     https_proxy = os.environ.get("HTTPS_PROXY")
     force_redownload = False
-    output_file = "./book118.pdf"
+    output_file = None
     thread_number = 10
+    safe_download = False
 
     try:
         opts, args = getopt.getopt(
-            argv, "hi:o:p:ft:", ["help", "id=", "proxy=", "output=", "force", "thread="])
+            argv, "hu:o:p:ft:s", ["help", "url=", "proxy=", "output=", "force", "thread=", "safe"])
     except getopt.GetoptError:
         print(help_text)
         sys.exit(1)
@@ -34,12 +36,8 @@
         if opt in ('-h', "--help"):
             print(help_text)
             exit(0)
-        elif opt in ("-i", "--id"):
-            try:
-                document_id = int(arg)
-            except:
-                document_id = int(re.findall(
-                    r"https://max.book118.com/html/\d+/\d+/(\d+).shtm", arg)[0])
+        elif opt in ("-u", "--url"):
+            document_url = arg
         elif opt in ("-o", "--output"):
             output_file = arg
         elif opt in ("-p", "--proxy"):
@@ -51,17 +49,20 @@
             thread_number = int(arg)
         except:
             pass
-    if document_id == None:
+        elif opt in ("-s", "--safe"):
+            safe_download = True
+    if document_url == None:
         print(help_text)
         exit(1)
-    return (document_id, http_proxy, https_proxy,
-            force_redownload, output_file, thread_number)
+    return (document_url, http_proxy, https_proxy,
+            force_redownload, output_file, thread_number, safe_download)
 
 
 def main():
-    (document_id, http_proxy, https_proxy,
-     force_redownload, output_file, thread_number) = solve_argv(sys.argv[1:])
+    (document_url, http_proxy, https_proxy,
+     force_redownload, output_file, thread_number, safe_download) = solve_argv(sys.argv[1:])
     set_proxy(http_proxy, https_proxy)
-
-    document_download(document_id, force_redownload,
-                      output_file, thread_number)
+    if safe_download:
+        thread_number = 1
+    document_download(document_url, force_redownload,
+                      output_file, thread_number, safe_download)
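For review purposes, a hypothetical call showing the reworked `solve_argv` surface and the 7-tuple it now returns; the argv values below are made up to mirror the README example:

```python
from book118 import solve_argv

# Hypothetical arguments; -u replaces the old -i, and -s is the new safe mode.
(document_url, http_proxy, https_proxy, force_redownload,
 output_file, thread_number, safe_download) = solve_argv([
    "-u", "https://max.book118.com/html/2020/0109/5301014320002213.shtm",
    "-t", "20",
    "-s",
])

assert document_url.endswith(".shtm")
assert safe_download          # set by -s
assert thread_number == 20    # main() later drops this to 1 when -s is set
```

Note that `solve_argv` itself leaves `thread_number` alone; the downgrade to a single thread happens in `main()`.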
diff --git a/book118/book118.py b/book118/book118.py
index 40828d9..53a0a5a 100644
--- a/book118/book118.py
+++ b/book118/book118.py
@@ -1,3 +1,4 @@
+import json
 import sys
 import os
 import re
@@ -5,98 +6,105 @@
 from .thread import Thread
 from .pdf import build_pdf
 from .request import get_page
+import time
 
 
-def get_preview_page_url(document_id: int or str) -> str:
+def get_next_pages(info: dict, page: int):
     '''
-    Get the URL of the Book118 preview page
+    Fetch the preview image URLs of the next six pages, starting from `page`
     '''
-    return get_page("https://max.book118.com/index.php", {
-        "g": "Home",
-        "m": "View",
-        "a": "ViewUrl",
-        "cid": document_id,
-        "flag": 1,
-    }).text
-
-
-def get_next_page(info: object, img: str):
-    '''
-    Get the URL of the next image based on the current one
-    '''
-    result = get_page('https://' + info["domain"] + '.book118.com/PW/GetPage/?', {
-        'f': info['Url'],
-        'img': img,
-        'isMobile': 'false',
-        'isNet': 'True',
-        'readLimit': info['ReadLimit'],
-        'furl': info['Furl']
-    }).json()
+    result_text = get_page("https://openapi.book118.com/getPreview.html?", {
+        "project_id": 1,
+        "aid": info["aid"],
+        "view_token": info["view_token"],
+        "page": str(page),
+        "callback": "jQuery",  # effectively optional; any callback name works
+        "_": int(round(time.time() * 1000))  # timestamp in milliseconds
+    }).text[7:-2]  # strip the leading "jQuery(" and trailing ");" to get the JSON holding the preview image URLs
+    result = json.loads(result_text)
     return result
 
 
-def get_page_info(document_id: int or str) -> object:
+def get_page_info(document_url: str) -> dict:
     '''
     Fetch the document's metadata
     '''
-    preview_url = get_preview_page_url(document_id)
-    preview_page = get_page("https:" + preview_url).text
-
-    info = {
-        "domain": re.findall(r'//(.*?)\..*', preview_url)[0]
-    }
-    forms = re.findall(
-        r'<input type="hidden" id="(.*?)" value="(.*?)"\s*/>', preview_page)
-    for form in forms:
-        info[form[0]] = form[1]
+    preview_webpage = get_page(document_url).text
+
+    info = {"aid": re.findall(r'\d+', re.findall(r'office: {.*\n.*aid: \d+', preview_webpage)[0])[-1],
+            # the last digit run is the one matched by "aid: \d+"; this aid is the decrypted document id
+            "view_token": re.findall(r' view_token: \'.+\'', preview_webpage)[0][14:-1],
+            "title": re.findall(r' title: \'.+\'', preview_webpage)[0][9:-1],
+            "preview_page": re.findall(r' preview_page: \d+', preview_webpage)[0][15:],  # pages available for preview
+            "actual_page": re.findall(r' actual_page: \d+', preview_webpage)[0][14:]  # actual number of pages
+            }
+    if info["preview_page"] != info["actual_page"]:
+        print("attention! only {} of {} page(s) are free to preview (download)\n".format(
+            info["preview_page"], info["actual_page"]))
     return info
 
 
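The `.text[7:-2]` slice in `get_next_pages` is plain JSONP unwrapping. A self-contained sketch with a fabricated response body, shaped the way the rest of the patch expects it (a `data` map from page number to a protocol-relative image URL, with empty strings when the server refuses):

```python
import json

# Fabricated body; the real one comes from openapi.book118.com/getPreview.html
# when called with callback=jQuery.
raw = 'jQuery({"data": {"1": "//example.invalid/page1.png", "2": ""}});'

payload = json.loads(raw[7:-2])  # len("jQuery(") == 7, len(");") == 2
print(payload["data"]["1"])      # //example.invalid/page1.png
```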
-def get_image_list(document_id: int or str, info: object) -> list:
+def get_image_url_list(info: dict, safe_download: bool) -> list:
     '''
     Fetch the list of preview image URLs for the document
     '''
-    img_list = []
-    while 1:
-        json_data = get_next_page(info, "" if len(
-            img_list) == 0 else img_list[-1])
-        print("Get image url {}/{} {}".format(
-            json_data["PageIndex"], json_data["PageCount"], json_data["NextPage"]
-        ))
-        img_list.append(json_data["NextPage"])
-        if json_data["PageCount"] <= json_data["PageIndex"]:
-            break
-    return img_list
-
-
-def document_download(document_id: int or str, force_redownload: bool,
-                      output_file: str, thread_number: int):
-    domain = ""
-    img_list = []
+    current_page = 1
+    retry = 5
+    img_url_list = []
+    while current_page <= int(info["preview_page"]):
+        json_data = get_next_pages(info, current_page)
+        for value in json_data["data"].values():
+            if value == '':
+                if retry == 0:
+                    print("Cannot get a valid response; too many failures.")
+                    sys.exit(1)
+                print("Empty response; {} retries left. Consider using -s (--safe) to limit request frequency".format(retry))
+                time.sleep(1)
+                retry -= 1
+                break
+            print("Getting image url {}/{}".format(current_page, info["preview_page"]))
+            img_url_list.append("https:" + value)
+            current_page += 1
+            retry = 5
+        if safe_download:
+            time.sleep(3)
+        else:
+            time.sleep(2)
+    return img_url_list
+
+
+def document_download(document_url: str, force_redownload: bool,
+                      output_file: str, thread_number: int, safe_download: bool):
+    document_id = int(re.findall(
+        r"https://max.book118.com/html/\d+/\d+/(\d+).shtm", document_url)[0])
     temp_dir = "./temp/{}".format(document_id)
     temp_file = "{}/{}".format(temp_dir, "img_list")
 
     if force_redownload or not os.path.exists(temp_file):
-        info = get_page_info(document_id)
-        domain = info["domain"]
-        img_list = get_image_list(document_id, info)
+        info = get_page_info(document_url)
+        if output_file is None:
+            output_file = info["title"]
+        img_url_list = get_image_url_list(info, safe_download)
+
         if not os.path.exists("./temp/"):
             os.mkdir("./temp/")
         if not os.path.exists(temp_dir):
             os.mkdir(temp_dir)
         with open(temp_file, 'w') as f:
-            f.write(info["domain"]+"\n")
-            [f.write(img+"\n") for img in img_list]
+            f.write(info["title"] + "\n")
+            [f.write(img_url + "\n") for img_url in img_url_list]
     else:
         with open(temp_file, 'r') as f:
-            img_list = f.read().split("\n")
-            domain = img_list[0]
-            img_list = img_list[1:]
-            img_list = list(filter(lambda x: len(x) > 0, img_list))
-
-    download_list = img_list if force_redownload else list(filter(
-        lambda x: not os.path.exists("{}/{}.jpg".format(temp_dir, x)),
-        img_list
+            img_url_list = f.read().split("\n")
+            if output_file is None:
+                output_file = img_url_list[0]
+            img_url_list = img_url_list[1:]
+            img_url_list = list(filter(lambda x: len(x) > 0, img_url_list))
+
+    download_list = img_url_list if force_redownload else list(filter(
+        lambda x: not os.path.exists("{}/{}".format(temp_dir, x[x.rfind('/') + 1:])),
+        img_url_list
     ))
     print(download_list)
 
@@ -116,11 +124,12 @@ def task_pool() -> str:
             lock.release()
             return img
 
-    def download_image(thread_id: int, img: str):
-        with open("{}/{}.jpg".format(temp_dir, img), "wb") as f:
-            print("Thread", thread_id, "download image", img)
-            f.write(get_page('http://' + domain +
-                             '.book118.com/img/?', {'img': img}).content)
+    def download_image(thread_id: int, img_url: str):
+        with open("{}/{}".format(temp_dir, img_url[img_url.rfind('/') + 1:]), "wb") as f:
+            print("Thread", thread_id, "download image", img_url)
+            f.write(get_page(img_url).content)
+        if safe_download:
+            time.sleep(1)
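For context, the `Thread` wrapper imported from `.thread` is untouched by this patch and absent from the diff. The sketch below is a minimal stand-in for the pull-from-pool pattern that `download_image` and `task_pool` plug into; the empty-string exhaustion sentinel is an assumption, not taken from the source:

```python
import threading

def make_worker(thread_id, download_image, task_pool):
    # Each worker repeatedly pulls the next image URL from the shared,
    # lock-protected pool and downloads it, until the pool runs dry.
    def run():
        while True:
            img_url = task_pool()  # assumed to return "" once exhausted
            if not img_url:
                break
            download_image(thread_id, img_url)
    return threading.Thread(target=run)
```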
 
     threads = [Thread(i, download_image, task_pool)
                for i in range(thread_number)]
@@ -130,8 +139,8 @@
         thread.join()
     del threads
 
-    print("Downloading images finished. Generate PDF file ", output_file)
+    print("Downloading images finished. Generate PDF file ", output_file + '.pdf')
 
-    build_pdf(temp_dir, img_list, output_file)
+    build_pdf(temp_dir, img_url_list, output_file)
 
-    print("Generating PDF file finished. File name ", output_file)
+    print("Generating PDF file finished. File name ", output_file + '.pdf')
diff --git a/book118/pdf.py b/book118/pdf.py
index 2a65005..643cdfe 100644
--- a/book118/pdf.py
+++ b/book118/pdf.py
@@ -1,11 +1,15 @@
 from PIL import Image
 from reportlab.pdfgen import canvas
-
+import os
+import sys
 
 def build_pdf(temp_dir: str, img_list: list, output_file: str):
-    pdf = canvas.Canvas(output_file)
+    pdf = canvas.Canvas('./' + output_file + '.pdf')
     for img in img_list:
-        file_path = "{}/{}.jpg".format(temp_dir, img)
+        file_path = "{}/{}".format(temp_dir, img[img.rfind('/') + 1:])
+        if not os.path.exists(file_path):
+            print("Some images are missing; use -f (--force) to force a re-download")
+            sys.exit(1)
         w, h = Image.open(file_path).size
         pdf.setPageSize((w, h))
         pdf.drawImage(file_path, 0, 0, w, h)
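A closing note on `build_pdf`: its one non-obvious trick is sizing every PDF page to its own source image, so mixed page dimensions survive conversion. A standalone sketch of the same reportlab pattern; the helper name is mine, and the trailing `showPage`/`save` calls are assumed, since the hunk above ends before the function does:

```python
from PIL import Image
from reportlab.pdfgen import canvas

def images_to_pdf(image_paths, pdf_path):
    pdf = canvas.Canvas(pdf_path)
    for path in image_paths:
        w, h = Image.open(path).size   # native pixel size of this page's image
        pdf.setPageSize((w, h))        # the page matches the image exactly
        pdf.drawImage(path, 0, 0, w, h)
        pdf.showPage()                 # finish this page before starting the next
    pdf.save()
```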