Merge pull request #4 from alxt17/master
update code for book118's change
OhYee authored Oct 19, 2021
2 parents e13fe97 + c18c7fd commit 2d7ec41
Showing 4 changed files with 117 additions and 101 deletions.
14 changes: 8 additions & 6 deletions README.md
@@ -16,11 +16,12 @@
|Parameter |Description |Required|
|:----------------|:--------------------------------------------------------------------------------------------------|:------|
|`-h``--help` |Show help ||
|`-i``--id` |ID of the document to download (or its page URL) ||
|`-o``--output` |Output file name, defaults to `book118.pdf` ||
|`-u``--url` |Page URL of the document to download ||
|`-o``--output` |Output file name, defaults to the document title plus `.pdf` ||
|`-p``--proxy` |Proxy address to use (defaults to the values of the `HTTP_PROXY` and `HTTPS_PROXY` environment variables); use `-p ''` to force a direct connection ||
|`-f``--force` |Force re-download instead of using the cache ||
|`-t``--thread` |Number of threads to use ||
|`-t``--thread` |Number of threads to use, defaults to 10 if not specified ||
|`-s``--safe` |Enable this if the server rejects requests; forces single-threaded mode and increases the interval between requests and downloads (see the example below) ||
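
For example, the new URL and safe-mode options can be combined in an invocation like the following (illustrative only; `<document url>` stands in for a real Book118 page address):

`documentDownloader -u <document url> -s`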

## Using the module

@@ -31,7 +32,7 @@ python3 -m pip install documentDownloader

Once installed, the `documentDownloader` command can be used directly.

e.g.: `documentDownloader -i https://max.book118.com/html/2020/0109/5301014320002213.shtm -o '单身人群专题研究报告-2019.pdf' -p http://127.0.0.1:1080 -f -t 20`
e.g.: `documentDownloader -u https://max.book118.com/html/2020/0109/5301014320002213.shtm -o '单身人群专题研究报告-2019' -p http://127.0.0.1:1080 -f -t 20`

### Using main.py from the source directly

@@ -41,7 +42,7 @@ python3 -m pip install documentDownloader
2. Install the dependency modules (Pillow, reportlab, requests): `python -m pip install -r requirements.txt`
3. Run it with `python3 main.py`

e.g.: `python main.py -i https://max.book118.com/html/2020/0109/5301014320002213.shtm -o '单身人群专题研究报告-2019.pdf' -p http://127.0.0.1:1080 -f -t 20`
e.g.: `python main.py -u https://max.book118.com/html/2020/0109/5301014320002213.shtm -o '单身人群专题研究报告-2019' -p http://127.0.0.1:1080 -f -t 20`

**For learning about web scraping and related topics only; please support legitimate books**
*Although, to be fair, many of the PDFs on book118 are themselves pirated*
@@ -55,4 +56,5 @@

- 2019-01-29: Book118 site updated; adjusted the corresponding code. [@JodeZer](https://github.com/JodeZer)
- 2020-01-09: Refactored the code; added multi-threaded download acceleration, proxy support, building the PDF directly from an existing cache, and automatic image-size detection when generating the PDF. [@OhYee](https://github.com/OhYee)
- 2020-05-25: Published to PyPI
- 2020-05-25: Published to PyPI
- 2021-10-18: Book118 site updated; adjusted the affected code. The default output PDF file name is now the document title; a notice is shown for documents whose full text cannot be previewed for free; the request interval was changed to 2 seconds (in testing, intervals under 2 seconds are likely to return empty addresses); and a "slow download" option was added to avoid being rejected by the server for downloading too fast. [@alxt17](https://github.com/alxt17)
39 changes: 20 additions & 19 deletions book118/__init__.py
@@ -7,25 +7,27 @@


def solve_argv(argv):
help_text = '''python main.py -i <document id>
help_text = '''python main.py -u <document url>
-h --help show help
-i --id document id (required)
-u --url document url (required)
-o --output output file (default `book118.pdf`)
-p --proxy proxy url (default using `http_proxy` and `https_proxy`)
-f --force force re-download images
-t --thread thread number for downloading images
-s --safe limit download request in case server refuses
'''

document_id = None
document_url = None
http_proxy = os.environ.get("HTTP_PROXY")
https_proxy = os.environ.get("HTTPS_PROXY")
force_redownload = False
output_file = "./book118.pdf"
output_file = None
thread_number = 10
safe_download = False

try:
opts, args = getopt.getopt(
argv, "hi:o:p:ft:", ["help", "id=", "proxy=", "output=", "force", "thread="])
argv, "hu:o:p:ft:s", ["help", "url=", "proxy=", "output=", "force", "thread=", "safe"])
except getopt.GetoptError:
print(help_text)
sys.exit(1)
@@ -34,12 +36,8 @@ def solve_argv(argv):
if opt in ('-h', "--help"):
print(help_text)
exit(0)
elif opt in ("-i", "--id"):
try:
document_id = int(arg)
except:
document_id = int(re.findall(
r"https://max.book118.com/html/\d+/\d+/(\d+).shtm", arg)[0])
elif opt in ("-u", "--url"):
document_url = arg
elif opt in ("-o", "--output"):
output_file = arg
elif opt in ("-p", "--proxy"):
@@ -51,17 +49,20 @@
thread_number = int(arg)
except:
pass
if document_id == None:
elif opt in ("-s", "--safe"):
safe_download = True
if document_url == None:
print(help_text)
exit(1)
return (document_id, http_proxy, https_proxy,
force_redownload, output_file, thread_number)
return (document_url, http_proxy, https_proxy,
force_redownload, output_file, thread_number, safe_download)


def main():
(document_id, http_proxy, https_proxy,
force_redownload, output_file, thread_number) = solve_argv(sys.argv[1:])
(document_url, http_proxy, https_proxy,
force_redownload, output_file, thread_number, safe_download) = solve_argv(sys.argv[1:])
set_proxy(http_proxy, https_proxy)

document_download(document_id, force_redownload,
output_file, thread_number)
if safe_download:
thread_number = 1
document_download(document_url, force_redownload,
output_file, thread_number, safe_download)
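
Only the command-line entry point is documented above, but `document_download()` is an ordinary function, so in principle it can also be driven from Python. The sketch below is an assumption based on the package layout and the `document_download()` signature shown in this diff, not a documented API; note that `main()` additionally calls `set_proxy()` before downloading, which this sketch omits.

```python
# Minimal sketch of calling the downloader from Python instead of the CLI.
# Assumes the layout shown in this commit (book118/book118.py) and the
# document_download() signature from the diff; this is not a documented API.
from book118.book118 import document_download

document_download(
    "https://max.book118.com/html/2020/0109/5301014320002213.shtm",  # document_url
    False,  # force_redownload: reuse cached images under ./temp/<id>/ if present
    None,   # output_file: None falls back to the document title
    10,     # thread_number: same default as the -t option
    False,  # safe_download: True adds extra delays between requests and downloads
)
```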
155 changes: 82 additions & 73 deletions book118/book118.py
@@ -1,102 +1,110 @@
import json
import sys
import os
import re
import threading
from .thread import Thread
from .pdf import build_pdf
from .request import get_page
import time


def get_preview_page_url(document_id: int or str) -> str:
def get_next_pages(info: dict, page: int):
'''
Get the URL of the Book118 preview page
Get the next six preview image URLs starting from the current page
'''
return get_page("https://max.book118.com/index.php", {
"g": "Home",
"m": "View",
"a": "ViewUrl",
"cid": document_id,
"flag": 1,
}).text


def get_next_page(info: object, img: str):
'''
Get the URL of the next image based on the current image
'''
result = get_page('https://' + info["domain"] + '.book118.com/PW/GetPage/?', {
'f': info['Url'],
'img': img,
'isMobile': 'false',
'isNet': 'True',
'readLimit': info['ReadLimit'],
'furl': info['Furl']
}).json()
result_text = get_page("https://openapi.book118.com/getPreview.html?", {
"project_id": 1,
"aid": info["aid"],
"view_token": info["view_token"],
"page": str(page),
"callback": "jQuery", # 这个参数其实可有可无,名字也无所谓
"_": int(round(time.time() * 1000)) # 时间戳
}).text[7:-2] # 截掉"jQuery("和末尾的");",得到包含预览图片地址的json
result = json.loads(result_text)
return result


def get_page_info(document_id: int or str) -> object:
def get_page_info(document_url: str) -> dict:
'''
Get document info
'''
preview_url = get_preview_page_url(document_id)
preview_page = get_page("https:" + preview_url).text

info = {
"domain": re.findall(r'//(.*?)\..*', preview_url)[0]
}
forms = re.findall(
r'<input type="hidden" id="(.*?)" value="(.*?)".*?/>', preview_page)
for form in forms:
info[form[0]] = form[1]
preview_webpage = get_page(document_url).text

info = {"aid": re.findall(r'\d+', re.findall(r'office: {.*\n.*aid: \d+', preview_webpage)[0]),
# the aid that follows "office: {" is the decoded document id
"view_token": re.findall(r' view_token: \'.+\'', preview_webpage)[0][14:-1],
"title": re.findall(r' title: \'.+\'', preview_webpage)[0][9:-1],
"preview_page": re.findall(r' preview_page: \d+', preview_webpage)[0][15:], # 可预览的页数
"actual_page": re.findall(r' actual_page: \d+', preview_webpage)[0][14:] # 实际文章页数
}
if info["preview_page"] != info["actual_page"]:
print(
"attention! only {} of {} page(s) are free to preview(download)\n".format(info["preview_page"],
info["actual_page"]))
return info


def get_image_list(document_id: int or str, info: object) -> list:
def get_image_url_list(info: dict, safe_download: bool) -> list:
'''
Get the list of preview image URLs for the document
'''
img_list = []
while 1:
json_data = get_next_page(info, ""if len(
img_list) == 0 else img_list[-1])
print("Get image url {}/{} {}".format(
json_data["PageIndex"], json_data["PageCount"], json_data["NextPage"]
))
img_list.append(json_data["NextPage"])
if json_data["PageCount"] <= json_data["PageIndex"]:
break
return img_list


def document_download(document_id: int or str, force_redownload: bool,
output_file: str, thread_number: int):
domain = ""
img_list = []
current_page = 1
retry = 5
img_url_list = []
while current_page <= int(info["preview_page"]):
json_data = get_next_pages(info, current_page)
for value in json_data["data"].values():
if value == '':
if retry == 0:
print("Cannot get correct response, too many fails.")
sys.exit(1)
print("Empty response, retrying {}. Consider using -s(--safe) to limit request frequency".format(retry))
time.sleep(1)
retry -= 1
break
print("Getting image url {}/{}".format(current_page, info["preview_page"]))
img_url_list.append("https:" + value)
current_page += 1
retry = 5
if safe_download:
time.sleep(3)
else:
time.sleep(2)
return img_url_list


def document_download(document_url: str, force_redownload: bool,
output_file: str, thread_number: int, safe_download: bool):
document_id = int(re.findall(
r"https://max.book118.com/html/\d+/\d+/(\d+).shtm", document_url)[0])
temp_dir = "./temp/{}".format(document_id)
temp_file = "{}/{}".format(temp_dir, "img_list")

if force_redownload or not os.path.exists(temp_file):
info = get_page_info(document_id)
domain = info["domain"]
img_list = get_image_list(document_id, info)
info = get_page_info(document_url)
if output_file is None:
output_file = info["title"]
img_url_list = get_image_url_list(info, safe_download)

if not os.path.exists("./temp/"):
os.mkdir("./temp/")
if not os.path.exists(temp_dir):
os.mkdir(temp_dir)
with open(temp_file, 'w') as f:
f.write(info["domain"]+"\n")
[f.write(img+"\n") for img in img_list]
f.write(info["title"] + "\n")
[f.write(img_url + "\n") for img_url in img_url_list]
else:
with open(temp_file, 'r') as f:
img_list = f.read().split("\n")
domain = img_list[0]
img_list = img_list[1:]
img_list = list(filter(lambda x: len(x) > 0, img_list))

download_list = img_list if force_redownload else list(filter(
lambda x: not os.path.exists("{}/{}.jpg".format(temp_dir, x)),
img_list
img_url_list = f.read().split("\n")
if output_file is None:
output_file = img_url_list[0]
img_url_list = img_url_list[1:]
img_url_list = list(filter(lambda x: len(x) > 0, img_url_list))

download_list = img_url_list if force_redownload else list(filter(
lambda x: not os.path.exists("{}/{}".format(temp_dir, x[x.rfind('/') + 1:])),
img_url_list
))

print(download_list)
@@ -116,11 +124,12 @@ def task_pool() -> str:
lock.release()
return img

def download_image(thread_id: int, img: str):
with open("{}/{}.jpg".format(temp_dir, img), "wb") as f:
print("Thread", thread_id, "download image", img)
f.write(get_page('http://' + domain +
'.book118.com/img/?', {'img': img}).content)
def download_image(thread_id: int, img_url: str):
with open("{}/{}".format(temp_dir, img_url[img_url.rfind('/') + 1:]), "wb") as f:
print("Thread", thread_id, "download image", img_url)
f.write(get_page(img_url).content)
if safe_download:
time.sleep(1)

threads = [Thread(i, download_image, task_pool)
for i in range(thread_number)]
@@ -130,8 +139,8 @@ def download_image(thread_id: int, img: str):
thread.join()
del threads

print("Downloading images finished. Generate PDF file ", output_file)
print("Downloading images finished. Generate PDF file ", output_file + '.pdf')

build_pdf(temp_dir, img_list, output_file)
build_pdf(temp_dir, img_url_list, output_file)

print("Generating PDF file finished. File name ", output_file)
print("Generating PDF file finished. File name ", output_file + '.pdf')
10 changes: 7 additions & 3 deletions book118/pdf.py
@@ -1,11 +1,15 @@
from PIL import Image
from reportlab.pdfgen import canvas

import os
import sys

def build_pdf(temp_dir: str, img_list: list, output_file: str):
pdf = canvas.Canvas(output_file)
pdf = canvas.Canvas('./' + output_file + '.pdf')
for img in img_list:
file_path = "{}/{}.jpg".format(temp_dir, img)
file_path = "{}/{}".format(temp_dir, img[img.rfind('/') + 1:])
if not os.path.exists(file_path):
print("Some images missing, please use -f(--force) to force re-download")
sys.exit(1)
w, h = Image.open(file_path).size
pdf.setPageSize((w, h))
pdf.drawImage(file_path, 0, 0, w, h)
