diff options
| -rw-r--r-- | crawlers/youtube_channels.py | 163 |
1 files changed, 163 insertions, 0 deletions
# -*- encoding: utf-8 -*-

"""
Extract list of videos from Youtube channels and store information in JSON
files. Video URLs are replaced by their Hooktube equivalents.

Usage:
    python youtube_channels.py <CONFIG_FILE>

The configuration file must use the JSON format and contain the following keys:
    - key: A Google API key which authorizes the application to query the
           Youtube webservices.
    - storage: A path to a directory (created if it does not exist) where the
               results will be stored.
    - channels: A dictionary whose keys are labels that can be chosen
                arbitrarily and values are the channel identifiers that appear
                in the URL on Youtube when visiting the home page of the
                channel.

@author: <sylvain.herledan@hrafnagud.info>
@date: 2018-06-19
"""

import http.client
import logging
import json
import sys
import os

logger = logging.getLogger(__name__)

# Host serving the Youtube Data API
API_HOST = 'www.googleapis.com'

# Search request listing the videos of a channel, most recent first; the API
# key and the channel identifier are appended by get_videos_from_channel
SEARCH_URL = '/youtube/v3/search?maxResults=25&part=snippet,id&order=date'

# Prefix of the Hooktube URLs stored in place of the Youtube ones
BASE_VIDEO_URL = 'https://hooktube.com/watch?v='


class MissingConfigFile(Exception):
    """Raised when the config file does not exist."""
    pass


class MissingChannels(Exception):
    """Raised when the config file does not provide a list of channels."""
    pass


class MissingStorage(Exception):
    """Raised when the config file does not provide an output storage."""
    pass


class MissingAPIKey(Exception):
    """Raised when the config file does not contain a Youtube API key."""
    pass


class YoutubeAPIError(Exception):
    """Raised when the Youtube API answers with an error or an unreadable
    body."""
    pass


def _decode_response(response, channel_id):
    """Read the body of an HTTP response and return it as a dictionary.

    Raise YoutubeAPIError if the HTTP status is not 200 or if the body is not
    a JSON object. The request URL is deliberately left out of the error
    message because it contains the API key."""
    # The body is read in all cases: the connection cannot send another
    # request while a response is pending
    raw = response.read()
    try:
        payload = json.loads(raw.decode('utf-8'))
    except ValueError:
        # Covers both undecodable bytes and malformed JSON
        payload = None

    if response.status != 200 or not isinstance(payload, dict):
        detail = ''
        # NOTE(review): assumes API errors look like {"error": {"message":
        # ...}} -- confirm against the Youtube Data API documentation
        if isinstance(payload, dict) and isinstance(payload.get('error'),
                                                    dict):
            detail = payload['error'].get('message', '')
        raise YoutubeAPIError(
            f'Youtube API request for channel "{channel_id}" failed '
            f'(HTTP {response.status} {response.reason}): {detail}')

    return payload


def get_videos_from_channel(conn, api_key, channel_id, results, init=False):
    """Query the Youtube search API for the videos of a channel and add them
    to `results`.

    `conn` is an open HTTP(S) connection to the API host. `results` is a
    dictionary modified in place: one entry is added (or overwritten) per
    video, the key being the publication date reported by the API and the
    value a dictionary with the "title", "description" and "url" (Hooktube
    link) of the video.

    When `init` is true all the result pages are fetched, otherwise only the
    first page (the 25 most recent items) is requested.

    Raise YoutubeAPIError if the API does not return a valid result."""
    first_url = f'{SEARCH_URL}&key={api_key}&channelId={channel_id}'
    url = first_url

    while True:
        conn.request('GET', url)
        payload = _decode_response(conn.getresponse(), channel_id)

        for item in payload.get('items', []):
            # Search results may also contain channels or playlists
            if item['id']['kind'] != 'youtube#video':
                continue

            snippet = item['snippet']
            video_id = item['id']['videoId']
            results[snippet['publishedAt']] = {
                'title': snippet['title'],
                'description': snippet['description'],
                'url': f'{BASE_VIDEO_URL}{video_id}',
            }

        if not init:
            break

        # The last page does not provide any token
        next_page_token = payload.get('nextPageToken')
        if not next_page_token:
            break
        url = f'{first_url}&pageToken={next_page_token}'


def load_config(cfg_path):
    """Load configuration from JSON file and check that it contains a channels
    list, an output storage and a Youtube API key.

    Return the configuration as a dictionary. Raise MissingConfigFile if
    `cfg_path` does not exist, and MissingChannels, MissingStorage or
    MissingAPIKey if the corresponding key is absent."""
    try:
        with open(cfg_path, encoding='utf-8') as f:
            cfg = json.load(f)
    except FileNotFoundError:
        raise MissingConfigFile from None

    if 'channels' not in cfg:
        raise MissingChannels

    if 'storage' not in cfg:
        raise MissingStorage

    if 'key' not in cfg:
        raise MissingAPIKey

    return cfg


def crawl_youtube_channels(channels, api_key, output_dir):
    """Update the JSON file of each channel with the videos found on Youtube.

    `channels` maps labels to channel identifiers. For each label the file
    "<label>.json" in `output_dir` is loaded if it exists, completed with the
    videos returned by the API and written back with its entries sorted by
    key. If the file does not exist yet, all the result pages are fetched,
    otherwise only the most recent videos are requested."""
    conn = http.client.HTTPSConnection(API_HOST)

    try:
        for channel_name, channel_id in channels.items():
            channel_path = os.path.join(output_dir, f'{channel_name}.json')
            init = True
            results = {}
            if os.path.exists(channel_path):
                with open(channel_path, encoding='utf-8') as f:
                    results = json.load(f)
                init = False

            get_videos_from_channel(conn, api_key, channel_id, results, init)

            with open(channel_path, 'w', encoding='utf-8') as f:
                _results = {k: results[k] for k in sorted(results)}
                json.dump(_results, f, indent=2)
    finally:
        # Release the socket even if a channel could not be processed
        conn.close()
        conn.auto_open = 0


def main(argv=None):
    """Run the crawler with the configuration file given on the command line.

    `argv` defaults to sys.argv. Return the exit status of the program: 0 on
    success, 1 if the configuration is missing or invalid or if the Youtube
    API reported an error."""
    if argv is None:
        argv = sys.argv

    # Setup logging
    main_logger = logging.getLogger()
    main_logger.handlers = []
    handler = logging.StreamHandler()
    handler.setLevel(logging.DEBUG)
    main_logger.addHandler(handler)
    main_logger.setLevel(logging.WARNING)

    if len(argv) < 2:
        logger.error('Usage: python youtube_channels.py <CONFIG_FILE>')
        return 1

    cfg_path = argv[1]

    try:
        cfg = load_config(cfg_path)
    except MissingConfigFile:
        logger.error('Config file "%s" not found', cfg_path)
        return 1
    except MissingChannels:
        logger.error('Config file does not contain any channels list.')
        return 1
    except MissingStorage:
        logger.error('Config file does not contain any output storage.')
        return 1
    except MissingAPIKey:
        logger.error('Config file does not contain any Youtube API key.')
        return 1

    output_dir = cfg['storage']
    os.makedirs(output_dir, exist_ok=True)

    api_key = cfg['key']
    channels = cfg['channels']
    try:
        crawl_youtube_channels(channels, api_key, output_dir)
    except YoutubeAPIError as err:
        logger.error('%s', err)
        return 1

    return 0


if '__main__' == __name__:
    sys.exit(main())
