Merge pull request #4 from alxt17/master
update code for book118's change
OhYee authored Oct 19, 2021
2 parents e13fe97 + c18c7fd commit 2d7ec41
Showing 4 changed files with 117 additions and 101 deletions.
14 changes: 8 additions & 6 deletions README.md
@@ -16,11 +16,12 @@
|Parameter |Description |Required|
|:----------------|:--------------------------------------------------------------------------------------------------|:------|
|`-h``--help` |Show help ||
|`-i``--id` |ID of the document to download (or its page URL) ||
|`-o``--output` |Output file name, defaults to `book118.pdf` ||
|`-u``--url` |Page URL of the document to download ||
|`-o``--output` |Output file name, defaults to the document title plus `.pdf` ||
|`-p``--proxy` |Proxy address to use (defaults to the values of the `HTTP_PROXY` and `HTTPS_PROXY` environment variables); use `-p ''` to force a direct connection ||
|`-f``--force` |Force re-download instead of using the cache ||
|`-t``--thread` |Number of threads to use ||
|`-t``--thread` |Number of threads to use, defaults to 10 if not specified ||
|`-s``--safe` |Enable this if the server rejects requests; forces single-threaded mode and increases the interval between requests and downloads (see the example below) ||
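
For example, the new URL and safe-mode options can be combined in an invocation like the following (illustrative only; `<document url>` stands in for a real Book118 page address):

`documentDownloader -u <document url> -s`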

## Using the module

@@ -31,7 +32,7 @@ python3 -m pip install documentDownloader

Once installed, the `documentDownloader` command can be used directly.

e.g.: `documentDownloader -i https://max.book118.com/html/2020/0109/5301014320002213.shtm -o '单身人群专题研究报告-2019.pdf' -p http://127.0.0.1:1080 -f -t 20`
e.g.: `documentDownloader -u https://max.book118.com/html/2020/0109/5301014320002213.shtm -o '单身人群专题研究报告-2019' -p http://127.0.0.1:1080 -f -t 20`

### Using main.py from the source directly

@@ -41,7 +42,7 @@ python3 -m pip install documentDownloader
2. Install the dependency modules (Pillow, reportlab, requests): `python -m pip install -r requirements.txt`
3. Run it with `python3 main.py`

e.g.: `python main.py -i https://max.book118.com/html/2020/0109/5301014320002213.shtm -o '单身人群专题研究报告-2019.pdf' -p http://127.0.0.1:1080 -f -t 20`
e.g.: `python main.py -u https://max.book118.com/html/2020/0109/5301014320002213.shtm -o '单身人群专题研究报告-2019' -p http://127.0.0.1:1080 -f -t 20`

**For learning about web scraping and related topics only; please support legitimate books**
*Although, to be fair, many of the PDFs on book118 are themselves pirated*
@@ -55,4 +56,5 @@

- 2019-01-29: Book118 site updated; adjusted the corresponding code. [@JodeZer](https://github.com/JodeZer)
- 2020-01-09: Refactored the code; added multi-threaded download acceleration, proxy support, building the PDF directly from an existing cache, and automatic image-size detection when generating the PDF. [@OhYee](https://github.com/OhYee)
- 2020-05-25: Published to PyPI
- 2020-05-25: Published to PyPI
- 2021-10-18: Book118 site updated; adjusted the affected code. The default output PDF file name is now the document title; a notice is shown for documents whose full text cannot be previewed for free; the request interval was changed to 2 seconds (in testing, intervals under 2 seconds are likely to return empty addresses); and a "slow download" option was added to avoid being rejected by the server for downloading too fast. [@alxt17](https://github.com/alxt17)
39 changes: 20 additions & 19 deletions book118/__init__.py
@@ -7,25 +7,27 @@


def solve_argv(argv):
help_text = '''python main.py -i <document id>
help_text = '''python main.py -u <document url>
-h --help show help
-i --id document id (required)
-u --url document url (required)
-o --output output file (default `book118.pdf`)
-p --proxy proxy url (default using `http_proxy` and `https_proxy`)
-f --force force re-download images
-t --thread thread number for downloading images
-s --safe limit download request in case server refuses
'''

document_id = None
document_url = None
http_proxy = os.environ.get("HTTP_PROXY")
https_proxy = os.environ.get("HTTPS_PROXY")
force_redownload = False
output_file = "./book118.pdf"
output_file = None
thread_number = 10
safe_download = False

try:
opts, args = getopt.getopt(
argv, "hi:o:p:ft:", ["help", "id=", "proxy=", "output=", "force", "thread="])
argv, "hu:o:p:ft:s", ["help", "url=", "proxy=", "output=", "force", "thread=", "safe"])
except getopt.GetoptError:
print(help_text)
sys.exit(1)
@@ -34,12 +36,8 @@ def solve_argv(argv):
if opt in ('-h', "--help"):
print(help_text)
exit(0)
elif opt in ("-i", "--id"):
try:
document_id = int(arg)
except:
document_id = int(re.findall(
r"https://max.book118.com/html/\d+/\d+/(\d+).shtm", arg)[0])
elif opt in ("-u", "--url"):
document_url = arg
elif opt in ("-o", "--output"):
output_file = arg
elif opt in ("-p", "--proxy"):
@@ -51,17 +49,20 @@
thread_number = int(arg)
except:
pass
if document_id == None:
elif opt in ("-s", "--safe"):
safe_download = True
if document_url == None:
print(help_text)
exit(1)
return (document_id, http_proxy, https_proxy,
force_redownload, output_file, thread_number)
return (document_url, http_proxy, https_proxy,
force_redownload, output_file, thread_number, safe_download)


def main():
(document_id, http_proxy, https_proxy,
force_redownload, output_file, thread_number) = solve_argv(sys.argv[1:])
(document_url, http_proxy, https_proxy,
force_redownload, output_file, thread_number, safe_download) = solve_argv(sys.argv[1:])
set_proxy(http_proxy, https_proxy)

document_download(document_id, force_redownload,
output_file, thread_number)
if safe_download:
thread_number = 1
document_download(document_url, force_redownload,
output_file, thread_number, safe_download)
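
Only the command-line entry point is documented above, but `document_download()` is an ordinary function, so in principle it can also be driven from Python. The sketch below is an assumption based on the package layout and the `document_download()` signature shown in this diff, not a documented API; note that `main()` additionally calls `set_proxy()` before downloading, which this sketch omits.

```python
# Minimal sketch of calling the downloader from Python instead of the CLI.
# Assumes the layout shown in this commit (book118/book118.py) and the
# document_download() signature from the diff; this is not a documented API.
from book118.book118 import document_download

document_download(
    "https://max.book118.com/html/2020/0109/5301014320002213.shtm",  # document_url
    False,  # force_redownload: reuse cached images under ./temp/<id>/ if present
    None,   # output_file: None falls back to the document title
    10,     # thread_number: same default as the -t option
    False,  # safe_download: True adds extra delays between requests and downloads
)
```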
155 changes: 82 additions & 73 deletions book118/book118.py
@@ -1,102 +1,110 @@
import json
import sys
import os
import re
import threading
from .thread import Thread
from .pdf import build_pdf
from .request import get_page
import time


def get_preview_page_url(document_id: int or str) -> str:
def get_next_pages(info: dict, page: int):
'''
Get the URL of the Book118 preview page
Get the next six preview image URLs starting from the current page
'''
return get_page("https://max.book118.com/index.php", {
"g": "Home",
"m": "View",
"a": "ViewUrl",
"cid": document_id,
"flag": 1,
}).text


def get_next_page(info: object, img: str):
'''
Get the URL of the next image based on the current image
'''
result = get_page('https://' + info["domain"] + '.book118.com/PW/GetPage/?', {
'f': info['Url'],
'img': img,
'isMobile': 'false',
'isNet': 'True',
'readLimit': info['ReadLimit'],
'furl': info['Furl']
}).json()
result_text = get_page("https://openapi.book118.com/getPreview.html?", {
"project_id": 1,
"aid": info["aid"],
"view_token": info["view_token"],
"page": str(page),
"callback": "jQuery", # 这个参数其实可有可无,名字也无所谓
"_": int(round(time.time() * 1000)) # 时间戳
}).text[7:-2] # 截掉"jQuery("和末尾的");",得到包含预览图片地址的json
result = json.loads(result_text)
return result


def get_page_info(document_id: int or str) -> object:
def get_page_info(document_url: str) -> dict:
'''
Get document info
'''
preview_url = get_preview_page_url(document_id)
preview_page = get_page("https:" + preview_url).text

info = {
"domain": re.findall(r'//(.*?)\..*', preview_url)[0]
}
forms = re.findall(
r'<input type="hidden" id="(.*?)" value="(.*?)".*?/>', preview_page)
for form in forms:
info[form[0]] = form[1]
preview_webpage = get_page(document_url).text

info = {"aid": re.findall(r'\d+', re.findall(r'office: {.*\n.*aid: \d+', preview_webpage)[0]),
# the aid that follows "office: {" is the decoded document id
"view_token": re.findall(r' view_token: \'.+\'', preview_webpage)[0][14:-1],
"title": re.findall(r' title: \'.+\'', preview_webpage)[0][9:-1],
"preview_page": re.findall(r' preview_page: \d+', preview_webpage)[0][15:], # 可预览的页数
"actual_page": re.findall(r' actual_page: \d+', preview_webpage)[0][14:] # 实际文章页数
}
if info["preview_page"] != info["actual_page"]:
print(
"attention! only {} of {} page(s) are free to preview(download)\n".format(info["preview_page"],
info["actual_page"]))
return info


def get_image_list(document_id: int or str, info: object) -> list:
def get_image_url_list(info: dict, safe_download: bool) -> list:
'''
Get the list of preview image URLs for the document
'''
img_list = []
while 1:
json_data = get_next_page(info, ""if len(
img_list) == 0 else img_list[-1])
print("Get image url {}/{} {}".format(
json_data["PageIndex"], json_data["PageCount"], json_data["NextPage"]
))
img_list.append(json_data["NextPage"])
if json_data["PageCount"] <= json_data["PageIndex"]:
break
return img_list


def document_download(document_id: int or str, force_redownload: bool,
output_file: str, thread_number: int):
domain = ""
img_list = []
current_page = 1
retry = 5
img_url_list = []
while current_page <= int(info["preview_page"]):
json_data = get_next_pages(info, current_page)
for value in json_data["data"].values():
if value == '':
if retry == 0:
print("Cannot get correct response, too many fails.")
sys.exit(1)
print("Empty response, retrying {}. Consider using -s(--safe) to limit request frequency".format(retry))
time.sleep(1)
retry -= 1
break
print("Getting image url {}/{}".format(current_page, info["preview_page"]))
img_url_list.append("https:" + value)
current_page += 1
retry = 5
if safe_download:
time.sleep(3)
else:
time.sleep(2)
return img_url_list


def document_download(document_url: str, force_redownload: bool,
output_file: str, thread_number: int, safe_download: bool):
document_id = int(re.findall(
r"https://max.book118.com/html/\d+/\d+/(\d+).shtm", document_url)[0])
temp_dir = "./temp/{}".format(document_id)
temp_file = "{}/{}".format(temp_dir, "img_list")

if force_redownload or not os.path.exists(temp_file):
info = get_page_info(document_id)
domain = info["domain"]
img_list = get_image_list(document_id, info)
info = get_page_info(document_url)
if output_file is None:
output_file = info["title"]
img_url_list = get_image_url_list(info, safe_download)

if not os.path.exists("./temp/"):
os.mkdir("./temp/")
if not os.path.exists(temp_dir):
os.mkdir(temp_dir)
with open(temp_file, 'w') as f:
f.write(info["domain"]+"\n")
[f.write(img+"\n") for img in img_list]
f.write(info["title"] + "\n")
[f.write(img_url + "\n") for img_url in img_url_list]
else:
with open(temp_file, 'r') as f:
img_list = f.read().split("\n")
domain = img_list[0]
img_list = img_list[1:]
img_list = list(filter(lambda x: len(x) > 0, img_list))

download_list = img_list if force_redownload else list(filter(
lambda x: not os.path.exists("{}/{}.jpg".format(temp_dir, x)),
img_list
img_url_list = f.read().split("\n")
if output_file is None:
output_file = img_url_list[0]
img_url_list = img_url_list[1:]
img_url_list = list(filter(lambda x: len(x) > 0, img_url_list))

download_list = img_url_list if force_redownload else list(filter(
lambda x: not os.path.exists("{}/{}".format(temp_dir, x[x.rfind('/') + 1:])),
img_url_list
))

print(download_list)
@@ -116,11 +124,12 @@ def task_pool() -> str:
lock.release()
return img

def download_image(thread_id: int, img: str):
with open("{}/{}.jpg".format(temp_dir, img), "wb") as f:
print("Thread", thread_id, "download image", img)
f.write(get_page('http://' + domain +
'.book118.com/img/?', {'img': img}).content)
def download_image(thread_id: int, img_url: str):
with open("{}/{}".format(temp_dir, img_url[img_url.rfind('/') + 1:]), "wb") as f:
print("Thread", thread_id, "download image", img_url)
f.write(get_page(img_url).content)
if safe_download:
time.sleep(1)

threads = [Thread(i, download_image, task_pool)
for i in range(thread_number)]
@@ -130,8 +139,8 @@ def download_image(thread_id: int, img: str):
thread.join()
del threads

print("Downloading images finished. Generate PDF file ", output_file)
print("Downloading images finished. Generate PDF file ", output_file + '.pdf')

build_pdf(temp_dir, img_list, output_file)
build_pdf(temp_dir, img_url_list, output_file)

print("Generating PDF file finished. File name ", output_file)
print("Generating PDF file finished. File name ", output_file + '.pdf')
10 changes: 7 additions & 3 deletions book118/pdf.py
@@ -1,11 +1,15 @@
from PIL import Image
from reportlab.pdfgen import canvas

import os
import sys

def build_pdf(temp_dir: str, img_list: list, output_file: str):
pdf = canvas.Canvas(output_file)
pdf = canvas.Canvas('./' + output_file + '.pdf')
for img in img_list:
file_path = "{}/{}.jpg".format(temp_dir, img)
file_path = "{}/{}".format(temp_dir, img[img.rfind('/') + 1:])
if not os.path.exists(file_path):
print("Some images missing, please use -f(--force) to force re-download")
sys.exit(1)
w, h = Image.open(file_path).size
pdf.setPageSize((w, h))
pdf.drawImage(file_path, 0, 0, w, h)
