diff options
Diffstat (limited to 'crawlers/france_inter/rss.py')
| -rw-r--r-- | crawlers/france_inter/rss.py | 78 |
1 files changed, 78 insertions, 0 deletions
# -*- encoding: utf-8 -*-

"""
Crawl a fixed set of France Inter podcast RSS feeds and record each new
entry through ``crawlers.db``.

@author: <sylvain.herledan@hrafnagud.info>
@date: 2017-06-07
"""

import os
import logging
import argparse
import smhasher
import feedparser
from crawlers import db

# The root logger is configured at import time: the handler lets everything
# through and the effective verbosity is controlled by the logger level
# (WARN by default, DEBUG when --debug is passed).
logger = logging.getLogger()
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
logger.addHandler(handler)
logger.setLevel(logging.WARN)


def parse_args():
    """Parse the command-line arguments.

    Returns:
        The ``argparse`` namespace with two attributes: ``debug`` (bool,
        False unless ``--debug`` is given) and ``db_path`` (str, or None
        when ``--db_path`` is not given).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--debug', action='store_true', default=False)
    parser.add_argument('--db_path', type=str, required=False)

    args = parser.parse_args()
    return args


def _entry_datetime(entry_id):
    """Build the timestamp string of an entry from its identifier.

    The basename of ``entry_id`` is split on '-' into three fields; the
    second field is split on '.' and its first three elements are emitted in
    reverse order (presumably DD.MM.YYYY -> YYYY-MM-DD; confirm against the
    feeds).

    Raises:
        ValueError: if the basename does not have the expected layout.
    """
    basename = os.path.basename(entry_id)
    fields = basename.split('-', 2)
    if len(fields) != 3:
        raise ValueError('no date field in "{}"'.format(basename))

    date_elems = fields[1].split('.')
    if len(date_elems) < 3:
        raise ValueError('incomplete date "{}"'.format(fields[1]))

    return '{}-{}-{}T00:00:00'.format(date_elems[2],
                                      date_elems[1],
                                      date_elems[0])


def parse_feed(config, emission, feed_url):
    """Store every entry of a feed that has not been processed yet.

    Entries are identified by the MurmurHash3 (x64, 128 bits) of their id.
    Entries whose hash is already known to the database are skipped, and so
    are entries whose id does not contain a parsable date (a warning is
    logged for those instead of aborting the whole crawl).

    Args:
        config: settings passed as-is to ``db.has_hash`` and
            ``db.store_hash``.
        emission: name of the programme, used to build the 'source' field.
        feed_url: location of the RSS feed, handed to ``feedparser.parse``.
    """
    r = feedparser.parse(feed_url)

    for entry in r.entries:
        e_url = entry.id
        e_hash = smhasher.murmur3_x64_128(e_url)

        # Check database to avoid reprocessing
        already_processed = db.has_hash(config, e_hash)
        if already_processed:
            continue

        try:
            e_datetime = _entry_datetime(e_url)
        except ValueError as err:
            # One unexpected identifier must not prevent the remaining
            # entries (and feeds) from being processed.
            logger.warning('Skipping entry %s: %s', e_url, err)
            continue

        e_source = 'France Inter: {}'.format(emission)

        # Save in database
        e = {'title': entry.title,
             'source': e_source,
             'url': e_url,
             'hash': e_hash,
             'datetime': e_datetime}
        db.store_hash(config, e)


def main():
    """Parse the command line and crawl every configured feed."""
    args = parse_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)

    config = {'db_path': '/tmp'}
    if args.db_path:
        if os.path.exists(args.db_path):
            config['db_path'] = args.db_path
        else:
            # Keep the historical fallback, but do not hide it from the user
            logger.warning('Database path %s does not exist, using %s',
                           args.db_path, config['db_path'])

    base_url = 'http://radiofrance-podcast.net'
    feeds = {'Ça peut pas faire de mal': 'podcast09/rss_11262.xml',
             'Sur les épaules de Darwin': 'podcast09/rss_11549.xml',
             'La preuve par Z': 'podcast09/rss_14430.xml'}

    for emission, feed_path in feeds.items():
        feed_url = '{}/{}'.format(base_url, feed_path)
        parse_feed(config, emission, feed_url)


if __name__ == '__main__':
    main()
