-rw-r--r--   crawlers/db.py                 29
-rw-r--r--   crawlers/france_inter/rss.py   78
-rw-r--r--   crawlers/france_inter/www.py  139
-rwxr-xr-x   setup.sh                       33
4 files changed, 279 insertions, 0 deletions
diff --git a/crawlers/db.py b/crawlers/db.py
new file mode 100644
index 0000000..1b70d84
--- /dev/null
+++ b/crawlers/db.py
@@ -0,0 +1,29 @@
+# -*- encoding: utf-8 -*-
+
+"""
+@author: <sylvain.herledan@hrafnagud.info>
+@date: 2017-06-07
+"""
+
+import os
+import json
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def has_hash(config, key):
+    """Return True if an item with this hash was already stored."""
+    db_path = config.get('db_path', '/tmp')
+    filename = '{}.dlz'.format(key)
+    filepath = os.path.join(db_path, filename)
+    return os.path.exists(filepath)
+
+
+def store_hash(config, item):
+    """Persist the item as a JSON file named after its hash."""
+    db_path = config.get('db_path', '/tmp')
+    filename = '{}.dlz'.format(item['hash'])
+    filepath = os.path.join(db_path, filename)
+    with open(filepath, 'w') as f:
+        json.dump(item, f, indent=2, ensure_ascii=False)
diff --git a/crawlers/france_inter/rss.py b/crawlers/france_inter/rss.py
new file mode 100644
index 0000000..48509d0
--- /dev/null
+++ b/crawlers/france_inter/rss.py
@@ -0,0 +1,78 @@
+# -*- encoding: utf-8 -*-
+
+"""
+@author: <sylvain.herledan@hrafnagud.info>
+@date: 2017-06-07
+"""
+
+import os
+import logging
+import argparse
+import smhasher
+import feedparser
+from crawlers import db
+
+logger = logging.getLogger()
+handler = logging.StreamHandler()
+handler.setLevel(logging.DEBUG)
+logger.addHandler(handler)
+logger.setLevel(logging.WARN)
+
+
+def parse_args():
+    """Parse the command-line arguments."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--debug', action='store_true', default=False)
+    parser.add_argument('--db_path', type=str, required=False)
+
+    args = parser.parse_args()
+    return args
+
+
+def parse_feed(config, emission, feed_url):
+    """Fetch an RSS feed and store entries whose ids embed a dd.mm.yyyy date."""
+    r = feedparser.parse(feed_url)
+
+    for entry in r.entries:
+        e_url = entry.id
+        e_hash = smhasher.murmur3_x64_128(e_url)
+
+        # Check database to avoid reprocessing
+        already_processed = db.has_hash(config, e_hash)
+        if already_processed:
+            continue
+
+        e_source = 'France Inter: {}'.format(emission)
+        basename = os.path.basename(entry.id)
+        _, d, _ = basename.split('-', 2)
+        date_elems = d.split('.')
+        e_datetime = '{}-{}-{}T00:00:00'.format(date_elems[2],
+                                                date_elems[1],
+                                                date_elems[0])
+
+        # Save in database
+        e = {'title': entry.title,
+             'source': e_source,
+             'url': e_url,
+             'hash': e_hash,
+             'datetime': e_datetime}
+        db.store_hash(config, e)
+
+
+if '__main__' == __name__:
+    args = parse_args()
+    if args.debug is True:
+        logger.setLevel(logging.DEBUG)
+
+    config = {'db_path': '/tmp'}
+    if args.db_path and os.path.exists(args.db_path):
+        config['db_path'] = args.db_path
+
+    base_url = 'http://radiofrance-podcast.net'
+    feeds = {'Ça peut pas faire de mal': 'podcast09/rss_11262.xml',
+             'Sur les épaules de Darwin': 'podcast09/rss_11549.xml',
+             'La preuve par Z': 'podcast09/rss_14430.xml'}
+
+    for emission in feeds:
+        feed_url = '{}/{}'.format(base_url, feeds[emission])
+        parse_feed(config, emission, feed_url)
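The two helpers in crawlers/db.py are the whole persistence and deduplication layer: an item counts as already crawled once a '<hash>.dlz' JSON file exists under db_path. A minimal sketch of the round-trip, reusing the same murmur3 keying as the crawlers (the URL and field values are hypothetical):

    import smhasher
    from crawlers import db

    config = {'db_path': '/tmp'}
    url = 'http://radiofrance-podcast.net/example-episode.mp3'  # hypothetical
    key = smhasher.murmur3_x64_128(url)

    if not db.has_hash(config, key):
        # First sighting: persists the item as /tmp/<key>.dlz
        db.store_hash(config, {'title': 'Example', 'source': 'France Inter',
                               'url': url, 'hash': key,
                               'datetime': '2017-06-07T00:00:00'})

    assert db.has_hash(config, key)  # later runs skip this entry
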
diff --git a/crawlers/france_inter/www.py b/crawlers/france_inter/www.py
new file mode 100644
index 0000000..4a75e9a
--- /dev/null
+++ b/crawlers/france_inter/www.py
@@ -0,0 +1,139 @@
+# -*- encoding: utf-8 -*-
+
+"""
+@author: <sylvain.herledan@hrafnagud.info>
+@date: 2017-06-06
+"""
+
+import os
+import sys
+import logging
+import argparse
+import smhasher
+import requests
+from html.parser import HTMLParser
+from crawlers import db
+
+logger = logging.getLogger()
+handler = logging.StreamHandler()
+handler.setLevel(logging.DEBUG)
+logger.addHandler(handler)
+logger.setLevel(logging.DEBUG)
+
+MNUMS = {'janvier': '01', 'fevrier': '02', 'mars': '03', 'avril': '04',
+         'mai': '05', 'juin': '06', 'juillet': '07', 'aout': '08',
+         'septembre': '09', 'octobre': '10', 'novembre': '11',
+         'decembre': '12'}
+
+
+class FranceInterPageParser(HTMLParser):
+    def __init__(self, emission, *args, **kwargs):
+        """Collect diffusion entries found in an emission page."""
+        self.entries = []
+        self.emission = emission
+        self.draft = None
+        super(FranceInterPageParser, self).__init__(*args, **kwargs)
+
+    def handle_starttag(self, tag, attributes):
+        """Extract url, title and date from the player <button> tags."""
+        if 'button' == tag:
+            e_url = None
+            e_title = None
+            e_datetime = None
+            for attr in attributes:
+                if 'data-url' == attr[0]:
+                    e_url = attr[1]
+                elif 'data-diffusion-title' == attr[0]:
+                    e_title = attr[1]
+                elif 'data-diffusion-path' == attr[0]:
+                    # The path ends with '<day>-<month name>-<year>',
+                    # so scan backwards for a plausible year.
+                    elems = attr[1].rsplit('-')
+                    year = month = day = None
+                    for offset in range(len(elems)):
+                        year_idx = -1 * (1 + offset)
+                        if not elems[year_idx].isdigit():
+                            continue
+                        if int(elems[year_idx]) in range(2000, 2050):
+                            year = elems[year_idx]
+                            month = elems[year_idx - 1]
+                            day = elems[year_idx - 2]
+                            break
+
+                    if year is not None and month in MNUMS:
+                        e_datetime = '{}-{}-{}T00:00:00'.format(year,
+                                                                MNUMS[month],
+                                                                day)
+
+            if None in (e_url, e_title, e_datetime):
+                return
+
+            e_source = 'France Inter: {}'.format(self.emission)
+            e_hash = smhasher.murmur3_x64_128(e_url)
+
+            self.draft = {'title': e_title,
+                          'source': e_source,
+                          'url': e_url,
+                          'hash': e_hash,
+                          'datetime': e_datetime}
+
+    def handle_endtag(self, tag):
+        """Commit the draft entry when the <button> tag closes."""
+        if 'button' == tag and self.draft is not None:
+            self.entries.append(self.draft)
+            self.draft = None
+
+
+def parse_args():
+    """Parse the command-line arguments."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--debug', action='store_true', default=False)
+    parser.add_argument('--db_path', type=str, required=False)
+
+    args = parser.parse_args()
+    return args
+
+
+if '__main__' == __name__:
+    args = parse_args()
+    if args.debug is True:
+        logger.setLevel(logging.DEBUG)
+
+    config = {'db_path': '/tmp'}
+    if args.db_path and os.path.exists(args.db_path):
+        config['db_path'] = args.db_path
+
+    base_url = 'https://www.franceinter.fr'
+    urls = {'Ça peut pas faire de mal': 'emissions/ca-peut-pas-faire-de-mal',
+            'Sur les épaules de Darwin': 'emissions/sur-les-epaules-de-darwin',
+            'La preuve par Z': 'emissions/la-preuve-par-z'}
+
+    for emission in urls:
+        emission_url = '{}/{}'.format(base_url, urls[emission])
+        page_number = 1
+
+        while 0 < page_number:
+            page_url = '{}?p={}'.format(emission_url, page_number)
+            r = requests.get(page_url)
+            if not r.ok:
+                logger.error(r.text)
+                sys.exit(1)
+            page_content = r.text
+
+            parser = FranceInterPageParser(emission)
+            parser.feed(page_content)
+            parser.close()
+
+            if 0 < len(parser.entries):
+                # Save results
+                for entry in parser.entries:
+                    already_processed = db.has_hash(config, entry['hash'])
+                    if already_processed:
+                        continue
+
+                    db.store_hash(config, entry)
+
+                # Process next page
+                page_number = page_number + 1
+            else:
+                page_number = -1
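FranceInterPageParser keys off the data-* attributes of the site's player <button> tags; markup of roughly this shape is assumed (attribute values hypothetical, inferred from the attributes the parser reads). Note how the date is recovered from the day-monthname-year tail of data-diffusion-path:

    from crawlers.france_inter.www import FranceInterPageParser

    sample = ('<button data-url="https://media.example/darwin.mp3"'
              ' data-diffusion-title="Les battements du temps"'
              ' data-diffusion-path="sur-les-epaules-de-darwin-03-juin-2017">'
              '</button>')

    parser = FranceInterPageParser('Sur les épaules de Darwin')
    parser.feed(sample)
    parser.close()
    # parser.entries[0]['datetime'] -> '2017-06-03T00:00:00'
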
"${install_script_path}" --user + rm -f "${install_script_path}" + + PYTHONUSERBASE="${env_dir}" "${env_dir}/bin/pip" install --user virtualenv + + local python_pkg_dir="$(find "${env_dir}/lib" -type d -name "*-packages" | head -1)"; + PYTHONPATH="${python_pkg_dir}" env/bin/virtualenv env +} + +install_virtualenv ${ARGS} +exit 0 |
