-
按照教程Part 1: Using YouTube’s Python API for Data Science申请YouTube API Key
-
按照教程Where to download your_client_secret_File.json file下载your_client_secret_File.json file
YouTube提供了多种视频爬取方式,以其中两种为例:一种是通过search query;一种是通过videoId。
通过search query爬取视频的youtube_videos.py脚本定义如下:
# -*- coding: utf-8 -*-
# Sample Python code for youtube.search.list
# See instructions for running these code samples locally:
# https://developers.google.com/explorer-help/guides/code_samples#python
import os
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
# Read-only access is enough for querying search results.
scopes = ["https://www.googleapis.com/auth/youtube.readonly"]


def main():
    """Authorize with OAuth and print one page of search results for 'cat'.

    Reads the OAuth client configuration from ``client_secret.json`` in the
    current working directory, runs the installed-app authorization flow,
    issues a single ``search.list`` request restricted to videos, and prints
    the raw response.
    """
    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled in production.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    api_service_name = "youtube"
    api_version = "v3"
    client_secrets_file = "client_secret.json"

    # Get credentials and create an API client.
    flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
        client_secrets_file, scopes)
    # run_console() depended on the out-of-band (copy/paste) OAuth flow, which
    # was removed in google-auth-oauthlib 1.0.0. The loopback flow below opens
    # the consent page in a browser and receives the code on a free local port.
    credentials = flow.run_local_server(port=0)
    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, credentials=credentials)

    # See https://developers.google.com/youtube/v3/docs for more ways to use
    # this API.
    # Search by query; the API only allows maxResults in the range [0, 50].
    request = youtube.search().list(
        q='cat',
        type="video",
        part="id,snippet",
        maxResults=50
    )
    response = request.execute()
    print(response)


if __name__ == "__main__":
    main()
当运行成功时,response返回结果格式如下:
{
"kind": "youtube#searchListResponse",
"etag": etag,
"nextPageToken": string,
"prevPageToken": string,
"regionCode": string,
"pageInfo": {
"totalResults": integer,
"resultsPerPage": integer
},
"items": [
search Resource
]
}
其中,search Resource的格式如下:
{
"kind": "youtube#searchResult",
"etag": etag,
"id": {
"kind": string,
"videoId": string,
"channelId": string,
"playlistId": string
},
"snippet": {
"publishedAt": datetime,
"channelId": string,
"title": string,
"description": string,
"thumbnails": {
(key): {
"url": string,
"width": unsigned integer,
"height": unsigned integer
}
},
"channelTitle": string,
"liveBroadcastContent": string
}
}
通过videoId爬取视频的youtube_videos.py脚本定义如下:
# -*- coding: utf-8 -*-
# Sample Python code for youtube.videos.list
# See instructions for running these code samples locally:
# https://developers.google.com/explorer-help/guides/code_samples#python
import os
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
# Read-only access is enough for fetching video metadata.
scopes = ["https://www.googleapis.com/auth/youtube.readonly"]


def main():
    """Authorize with OAuth and print the metadata of three fixed videos.

    Reads the OAuth client configuration from ``client_secret.json`` in the
    current working directory, runs the installed-app authorization flow,
    issues a single ``videos.list`` request for a comma-separated list of
    video IDs, and prints the raw response.
    """
    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled in production.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    api_service_name = "youtube"
    api_version = "v3"
    client_secrets_file = "client_secret.json"

    # Get credentials and create an API client.
    flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
        client_secrets_file, scopes)
    # run_console() depended on the out-of-band (copy/paste) OAuth flow, which
    # was removed in google-auth-oauthlib 1.0.0. The loopback flow below opens
    # the consent page in a browser and receives the code on a free local port.
    credentials = flow.run_local_server(port=0)
    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, credentials=credentials)

    # See https://developers.google.com/youtube/v3/docs for more ways to use
    # this API.
    # Look videos up by ID: ``id`` is a comma-separated list of video IDs and
    # ``part`` selects which sections of each video resource are returned.
    request = youtube.videos().list(
        part="snippet, contentDetails, recordingDetails, localizations, statistics",
        id="Ks-_Mh1QhMc,c0KYU2j0TM4,eIho2S0ZahI"
    )
    response = request.execute()
    print(response)


if __name__ == "__main__":
    main()
当运行成功时,response返回结果格式如下:
{
"kind": "youtube#videoListResponse",
"etag": etag,
"nextPageToken": string,
"prevPageToken": string,
"pageInfo": {
"totalResults": integer,
"resultsPerPage": integer
},
"items": [
video Resource
]
}
其中,video Resource的格式如下:
{
"kind": "youtube#video",
"etag": etag,
"id": string,
"snippet": {
"publishedAt": datetime,
"channelId": string,
"title": string,
"description": string,
"thumbnails": {
(key): {
"url": string,
"width": unsigned integer,
"height": unsigned integer
}
},
"channelTitle": string,
"tags": [
string
],
"categoryId": string,
"liveBroadcastContent": string,
"defaultLanguage": string,
"localized": {
"title": string,
"description": string
},
"defaultAudioLanguage": string
},
"contentDetails": {
"duration": string,
"dimension": string,
"definition": string,
"caption": string,
"licensedContent": boolean,
"regionRestriction": {
"allowed": [
string
],
"blocked": [
string
]
},
"contentRating": {
"acbRating": string,
"agcomRating": string,
"anatelRating": string,
"bbfcRating": string,
"bfvcRating": string,
"bmukkRating": string,
"catvRating": string,
"catvfrRating": string,
"cbfcRating": string,
"cccRating": string,
"cceRating": string,
"chfilmRating": string,
"chvrsRating": string,
"cicfRating": string,
"cnaRating": string,
"cncRating": string,
"csaRating": string,
"cscfRating": string,
"czfilmRating": string,
"djctqRating": string,
"djctqRatingReasons": [,
string
],
"ecbmctRating": string,
"eefilmRating": string,
"egfilmRating": string,
"eirinRating": string,
"fcbmRating": string,
"fcoRating": string,
"fmocRating": string,
"fpbRating": string,
"fpbRatingReasons": [,
string
],
"fskRating": string,
"grfilmRating": string,
"icaaRating": string,
"ifcoRating": string,
"ilfilmRating": string,
"incaaRating": string,
"kfcbRating": string,
"kijkwijzerRating": string,
"kmrbRating": string,
"lsfRating": string,
"mccaaRating": string,
"mccypRating": string,
"mcstRating": string,
"mdaRating": string,
"medietilsynetRating": string,
"mekuRating": string,
"mibacRating": string,
"mocRating": string,
"moctwRating": string,
"mpaaRating": string,
"mpaatRating": string,
"mtrcbRating": string,
"nbcRating": string,
"nbcplRating": string,
"nfrcRating": string,
"nfvcbRating": string,
"nkclvRating": string,
"oflcRating": string,
"pefilmRating": string,
"rcnofRating": string,
"resorteviolenciaRating": string,
"rtcRating": string,
"rteRating": string,
"russiaRating": string,
"skfilmRating": string,
"smaisRating": string,
"smsaRating": string,
"tvpgRating": string,
"ytRating": string
},
"projection": string,
"hasCustomThumbnail": boolean
},
"status": {
"uploadStatus": string,
"failureReason": string,
"rejectionReason": string,
"privacyStatus": string,
"publishAt": datetime,
"license": string,
"embeddable": boolean,
"publicStatsViewable": boolean,
"madeForKids": boolean,
"selfDeclaredMadeForKids": boolean
},
"statistics": {
"viewCount": unsigned long,
"likeCount": unsigned long,
"dislikeCount": unsigned long,
"favoriteCount": unsigned long,
"commentCount": unsigned long
},
"player": {
"embedHtml": string,
"embedHeight": long,
"embedWidth": long
},
"topicDetails": {
"topicIds": [
string
],
"relevantTopicIds": [
string
],
"topicCategories": [
string
]
},
"recordingDetails": {
"recordingDate": datetime
},
"fileDetails": {
"fileName": string,
"fileSize": unsigned long,
"fileType": string,
"container": string,
"videoStreams": [
{
"widthPixels": unsigned integer,
"heightPixels": unsigned integer,
"frameRateFps": double,
"aspectRatio": double,
"codec": string,
"bitrateBps": unsigned long,
"rotation": string,
"vendor": string
}
],
"audioStreams": [
{
"channelCount": unsigned integer,
"codec": string,
"bitrateBps": unsigned long,
"vendor": string
}
],
"durationMs": unsigned long,
"bitrateBps": unsigned long,
"creationTime": string
},
"processingDetails": {
"processingStatus": string,
"processingProgress": {
"partsTotal": unsigned long,
"partsProcessed": unsigned long,
"timeLeftMs": unsigned long
},
"processingFailureReason": string,
"fileDetailsAvailability": string,
"processingIssuesAvailability": string,
"tagSuggestionsAvailability": string,
"editorSuggestionsAvailability": string,
"thumbnailsAvailability": string
},
"suggestions": {
"processingErrors": [
string
],
"processingWarnings": [
string
],
"processingHints": [
string
],
"tagSuggestions": [
{
"tag": string,
"categoryRestricts": [
string
]
}
],
"editorSuggestions": [
string
]
},
"liveStreamingDetails": {
"actualStartTime": datetime,
"actualEndTime": datetime,
"scheduledStartTime": datetime,
"scheduledEndTime": datetime,
"concurrentViewers": unsigned long,
"activeLiveChatId": string
},
"localizations": {
(key): {
"title": string,
"description": string
}
}
}
根据resource的json结果,可以提取其中的信息,从而进行下一步统计。
# Walk the returned items (none if the key is absent) and keep only the
# entries whose id marks them as a video resource.
items = search_response.get("items", [])
for item in items:
    if item["id"]["kind"] != "youtube#video":
        continue
    videos.append(item)
由于官方YouTube API每次查询最多只能返回50个结果,而且每次查询返回的结果几乎一样;另外,YouTube网页需要将鼠标滑动到页面底端才会刷新并加载新一批视频,而此时url并没有变化,静态爬虫无法满足需求。因此,为了能够爬取与网页一致的信息,可以采用selenium来模拟鼠标操作浏览器,实现动态爬虫。
#! /usr/bin/env python3
# author: Qi Shao
########### load packages ############
from selenium import webdriver
import time
from bs4 import BeautifulSoup
# Location of the chromedriver binary.
# chromedriver download mirror: http://npm.taobao.org/mirrors/chromedriver/
CHROMEDRIVER_PATH = "/home/sensetime/Desktop/code/anet_dataset/chromedriver"

# Search terms to crawl, one results page per term.
QUERIES = ['cat', 'dog']

# Value of the "sp" URL parameter; per the original script's comment it limits
# the results to videos shorter than 4 minutes.
SEARCH_FILTER = 'EgQQARgB'

# Setting scrollTop far beyond the page height jumps to the bottom, which is
# what makes the results page load its next batch of videos.
SCROLL_TO_BOTTOM_JS = "var q=document.documentElement.scrollTop=100000000000"


def parse_video_id(href):
    """Return the video ID from a ``/watch?v=...`` link, or ``None``.

    Only the value of the ``v`` parameter is returned, so any trailing
    parameters (``&pp=...``, ``&list=...``) are not mistaken for part of the
    ID. Links that are missing, that are not watch links, or that carry an
    empty ``v`` value yield ``None``.
    """
    from urllib.parse import parse_qs, urlparse

    if not href or "/watch?v=" not in href:
        return None
    values = parse_qs(urlparse(href).query).get("v")
    return values[0] if values else None


def collect_video_ids(html):
    """Return the video IDs of all thumbnail links in ``html``, in page order."""
    soup = BeautifulSoup(html, 'lxml')
    video_ids = []
    for anchor in soup.find_all('a', id="thumbnail"):
        video_id = parse_video_id(anchor.get("href"))
        if video_id is not None:
            video_ids.append(video_id)
    return video_ids


def execute_times(driver, times, seen=None):
    """Print the video IDs on the current page, scrolling down ``times`` times.

    The page is parsed once before the first scroll and once after each
    scroll. Every parse sees the whole page again, so IDs already printed are
    tracked in ``seen`` and skipped.

    Args:
        driver: WebDriver already showing a YouTube results page.
        times: Number of scroll-to-bottom steps to perform.
        seen: Optional set of IDs to treat as already printed; it is updated
            in place. A fresh set is used when omitted.

    Returns:
        The set of all video IDs seen so far.
    """
    if seen is None:
        seen = set()
    for step in range(times + 1):
        for video_id in collect_video_ids(driver.page_source):
            if video_id not in seen:
                seen.add(video_id)
                print(video_id)
        # No need to scroll after the final parse: nothing would read the
        # newly loaded results.
        if step < times:
            driver.execute_script(SCROLL_TO_BOTTOM_JS)
            time.sleep(3)  # wait for the page to load the next batch
    return seen


def main():
    """Open Chrome, run each query on YouTube and print the video IDs found."""
    from urllib.parse import quote_plus

    # NOTE(review): ``executable_path`` is the Selenium 3 style argument;
    # newer Selenium 4 releases expect a ``Service`` object instead. Confirm
    # against the installed Selenium version.
    driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
    try:
        driver.get("https://www.youtube.com/")
        # Maximize the window, then reload the page.
        driver.maximize_window()
        time.sleep(1)
        driver.refresh()
        # Cookies are fetched as in the original walkthrough; nothing below
        # uses them.
        cookie = driver.get_cookies()
        for query in QUERIES:
            # Encode the query so terms with spaces or symbols stay valid.
            url = ('https://www.youtube.com/results?search_query='
                   + quote_plus(query) + '&sp=' + SEARCH_FILTER)
            driver.get(url)
            print(query)
            # Scroll down 3 times, printing the IDs found along the way.
            execute_times(driver, 3)
            time.sleep(1)
    finally:
        # Always close Chrome, even when a step above fails.
        driver.quit()


if __name__ == "__main__":
    main()
参考:https://www.zhihu.com/question/46528604
selenium参考:https://selenium-python.readthedocs.io/installation.html
Youtube视频下载可以使用youtube-dl工具,具体参考:https://github.com/ytdl-org/youtube-dl ,使用方法可以参考: https://zhuanlan.zhihu.com/p/27718783 ,批量生成下载命令可以参考:https://github.com/activitynet/ActivityNet/blob/master/Crawler/run_crosscheck.py