summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--crawlers/youtube_channels.py163
1 files changed, 163 insertions, 0 deletions
diff --git a/crawlers/youtube_channels.py b/crawlers/youtube_channels.py
new file mode 100644
index 0000000..806bccb
--- /dev/null
+++ b/crawlers/youtube_channels.py
@@ -0,0 +1,163 @@
+# -*- encoding: utf-8 -*-
+
+"""
+Extract list of videos from Youtube channels and store information in JSON
+files. Video URLs are replaced by their Hooktube equivalents.
+
+Usage:
+ python youtube_channels.py <CONFIG_FILE>
+
+The configuration file must use the JSON format and contain the following keys:
+ - key: A Google API key which authorizes the application to query the Youtube
+ webservices.
+ - storage: A path to a directory (created if it does not exist) where the
+ results will be stored.
+ - channels: A dictionary whose keys are labels that can be chosen arbitrarily
+ and values are the channel identifiers that appear in the URL on
+ Youtube when visiting the home page of the channel.
+
+@author: <sylvain.herledan@hrafnagud.info>
+@date: 2018-06-19
+"""
+
+import http.client
+import logging
+import json
+import sys
+import os
+
+logger = logging.getLogger(__name__)
+
+
+class MissingConfigFile(Exception):
+    """Signal that the configuration file could not be found."""
+
+
+class MissingChannels(Exception):
+    """Signal that the configuration has no 'channels' entry."""
+
+
+class MissingStorage(Exception):
+    """Signal that the configuration has no 'storage' entry."""
+
+
+class MissingAPIKey(Exception):
+    """Signal that the configuration has no 'key' entry (Youtube API key)."""
+
+
+def get_videos_from_channel(conn, api_key, channel_id, results, init=False):
+    """Query the Youtube search API for the videos of one channel.
+
+    Every item of kind "youtube#video" found in the response is stored in
+    `results` (modified in place) under its publication date, as a dict
+    holding the 'title', the 'description' and the Hooktube 'url' of the
+    video. Items of any other kind are ignored.
+
+    Parameters:
+     - conn: an http.client connection (or any object offering the same
+             request()/getresponse() methods) to the host serving the API.
+     - api_key: Google API key sent with each request.
+     - channel_id: identifier of the channel whose videos are listed.
+     - results: dict updated in place, keyed by the 'publishedAt' value.
+     - init: when True, follow 'nextPageToken' until the last page;
+             otherwise only the first page (at most 25 results) is read.
+
+    Raises http.client.HTTPException if the API answers with a status other
+    than 200.
+    """
+    # Local import: keeps this function self-contained without touching the
+    # module-level import list.
+    from urllib.parse import urlencode
+
+    base_video_url = 'https://hooktube.com/watch?v='
+    params = {'maxResults': 25,
+              'part': 'snippet,id',
+              'order': 'date',
+              'key': api_key,
+              'channelId': channel_id}
+
+    while True:
+        # urlencode() escapes the key and the channel identifier; the comma
+        # of the "part" value is kept readable.
+        url = '/youtube/v3/search?' + urlencode(params, safe=',')
+        conn.request('GET', url)
+        response = conn.getresponse()
+        # Always consume the body so that the connection can be reused.
+        payload = response.read()
+        if response.status != 200:
+            raise http.client.HTTPException(
+                f'Youtube API request failed: '
+                f'{response.status} {response.reason}')
+
+        json_result = json.loads(payload.decode('utf-8'))
+        for item in json_result['items']:
+            if item['id']['kind'] != 'youtube#video':
+                continue
+            snippet = item['snippet']
+            video_id = item['id']['videoId']
+            results[snippet['publishedAt']] = {
+                'title': snippet['title'],
+                'description': snippet['description'],
+                'url': f'{base_video_url}{video_id}',
+            }
+
+        # Pagination is only followed for the initial crawl of a channel and
+        # stops as soon as the API no longer advertises a next page.
+        next_page_token = json_result.get('nextPageToken')
+        if not init or not next_page_token:
+            break
+        params['pageToken'] = next_page_token
+
+
+def load_config(cfg_path):
+    """Load the configuration from a JSON file and validate its content.
+
+    Parameters:
+     - cfg_path: path of the JSON configuration file.
+
+    Returns the decoded configuration.
+
+    Raises:
+     - MissingConfigFile: `cfg_path` is not an existing regular file.
+     - MissingChannels: the configuration has no 'channels' key.
+     - MissingStorage: the configuration has no 'storage' key.
+     - MissingAPIKey: the configuration has no 'key' key.
+    """
+
+    # isfile() rather than exists(): a directory is not a usable config file
+    # and would otherwise fail later in open() with an unrelated error.
+    if not os.path.isfile(cfg_path):
+        raise MissingConfigFile
+
+    # Read as UTF-8 explicitly (as done for the result files) instead of
+    # depending on the locale's default encoding.
+    with open(cfg_path, encoding='utf-8') as f:
+        cfg = json.load(f)
+
+    if 'channels' not in cfg:
+        raise MissingChannels
+
+    if 'storage' not in cfg:
+        raise MissingStorage
+
+    if 'key' not in cfg:
+        raise MissingAPIKey
+
+    return cfg
+
+
+def crawl_youtube_channels(channels, api_key, output_dir):
+    """Crawl each configured channel and write its videos to a JSON file.
+
+    For every (label, channel identifier) pair of `channels`, the videos
+    are stored in "<output_dir>/<label>.json". When that file already
+    exists its content is loaded and only the first page of results is
+    requested; otherwise all pages are fetched. Entries are written sorted
+    by key (the publication date).
+
+    Parameters:
+     - channels: dict mapping arbitrary labels to channel identifiers.
+     - api_key: Google API key used to query the Youtube webservices.
+     - output_dir: existing directory receiving one JSON file per channel.
+    """
+    conn = http.client.HTTPSConnection('www.googleapis.com')
+
+    # try/finally so that the connection is released even if a request or
+    # a file operation fails for one of the channels.
+    try:
+        for channel_name, channel_id in channels.items():
+            channel_path = os.path.join(output_dir, f'{channel_name}.json')
+            init = True
+            results = {}
+            if os.path.exists(channel_path):
+                # Known channel: only look for recent videos
+                init = False
+                with open(channel_path, encoding='utf-8') as f:
+                    results = json.load(f)
+
+            get_videos_from_channel(conn, api_key, channel_id, results, init)
+
+            with open(channel_path, 'w', encoding='utf-8') as f:
+                _results = {k: results[k] for k in sorted(results)}
+                json.dump(_results, f, indent=2)
+    finally:
+        conn.close()
+        # Prevent the closed connection from silently reopening on reuse
+        conn.auto_open = 0
+
+
+if '__main__' == __name__:
+
+    # Setup logging: replace any existing root handlers with one stream
+    # handler and only let warnings and errors through.
+    main_logger = logging.getLogger()
+    main_logger.handlers = []
+    handler = logging.StreamHandler()
+    handler.setLevel(logging.DEBUG)
+    main_logger.addHandler(handler)
+    main_logger.setLevel(logging.WARN)
+
+    # Report a usage error instead of failing with an IndexError traceback
+    # when the configuration file argument is missing.
+    if len(sys.argv) < 2:
+        logger.error('Usage: python youtube_channels.py <CONFIG_FILE>')
+        sys.exit(1)
+
+    cfg_path = sys.argv[1]
+
+    try:
+        cfg = load_config(cfg_path)
+    except MissingConfigFile:
+        logger.error(f'Config file "{cfg_path}" not found')
+        sys.exit(1)
+    except MissingChannels:
+        logger.error('Config file does not contain any channels list.')
+        sys.exit(1)
+    except MissingStorage:
+        logger.error('Config file does not contain any output storage.')
+        sys.exit(1)
+    except MissingAPIKey:
+        logger.error('Config file does not contain any Youtube API key.')
+        sys.exit(1)
+    except json.JSONDecodeError as err:
+        # Malformed configuration: same exit path as the other config errors
+        logger.error(f'Config file "{cfg_path}" is not valid JSON: {err}')
+        sys.exit(1)
+
+    # makedirs(exist_ok=True) is a no-op when the directory already exists
+    output_dir = cfg['storage']
+    os.makedirs(output_dir, exist_ok=True)
+
+    api_key = cfg['key']
+    channels = cfg['channels']
+    crawl_youtube_channels(channels, api_key, output_dir)
+