summary refs log tree commit diff stats
path: root/crawlers/france_inter/rss.py
diff options
context:
space:
mode:
Diffstat (limited to 'crawlers/france_inter/rss.py')
-rw-r--r--crawlers/france_inter/rss.py78
1 files changed, 78 insertions, 0 deletions
diff --git a/crawlers/france_inter/rss.py b/crawlers/france_inter/rss.py
new file mode 100644
index 0000000..48509d0
--- /dev/null
+++ b/crawlers/france_inter/rss.py
@@ -0,0 +1,78 @@
+# -*- encoding: utf-8 -*-
+
+"""
+@author: <sylvain.herledan@hrafnagud.info>
+@date: 2017-06-07
+"""
+
+import os
+import logging
+import argparse
+import smhasher
+import feedparser
+from crawlers import db
+
+# Root logger setup: the logger-level threshold (WARN here, lowered to DEBUG
+# by the --debug flag in the script entry point) decides which records are
+# emitted; the stream handler itself lets everything from DEBUG up through.
+logger = logging.getLogger()
+logger.setLevel(logging.WARN)
+handler = logging.StreamHandler()
+handler.setLevel(logging.DEBUG)
+logger.addHandler(handler)
+
+
+def parse_args():
+    """Read the crawler's command-line options.
+
+    Two options are recognised: ``--debug``, a flag that is off unless given,
+    and ``--db_path``, an optional string.
+
+    :returns: the namespace built by ``argparse`` from the command line.
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--debug', action='store_true', default=False)
+    cli.add_argument('--db_path', type=str, required=False)
+    return cli.parse_args()
+
+
+def parse_feed(config, emission, feed_url):
+    """Parse one podcast feed and store the entries not seen before.
+
+    Each entry is keyed by ``smhasher.murmur3_x64_128`` applied to its id;
+    entries whose hash ``db.has_hash`` already reports are skipped.  The
+    entry date is read from the basename of the id, which is expected to
+    look like ``<prefix>-<dd.mm.yyyy>-<rest>``.  Entries without an id, or
+    whose id does not follow that pattern, are logged and skipped so that a
+    single unexpected entry does not abort the whole crawl.
+
+    :param config: settings handed through to the ``db`` helpers.
+    :param emission: name of the show, used to build the ``source`` field.
+    :param feed_url: URL of the RSS feed to parse.
+    """
+    # Identical for every entry of the feed, so build it once.
+    e_source = 'France Inter: {}'.format(emission)
+
+    for entry in feedparser.parse(feed_url).entries:
+        e_url = entry.get('id')
+        if not e_url:
+            logger.warning('Skipping entry without id in feed %s', feed_url)
+            continue
+
+        e_hash = smhasher.murmur3_x64_128(e_url)
+
+        # Check database to avoid reprocessing
+        if db.has_hash(config, e_hash):
+            continue
+
+        # The date is the second dash-separated field of the basename,
+        # written day.month.year; it is stored as year-month-day.
+        try:
+            _, d, _ = os.path.basename(e_url).split('-', 2)
+            day, month, year = d.split('.')[:3]
+        except ValueError:
+            logger.warning('Skipping entry with unexpected id format: %s',
+                           e_url)
+            continue
+        e_datetime = '{}-{}-{}T00:00:00'.format(year, month, day)
+
+        # Save in database
+        e = {'title': entry.title,
+             'source': e_source,
+             'url': e_url,
+             'hash': e_hash,
+             'datetime': e_datetime}
+        db.store_hash(config, e)
+
+
+if '__main__' == __name__:
+    # Script entry point: crawl every known France Inter feed once.
+    args = parse_args()
+    if args.debug:
+        logger.setLevel(logging.DEBUG)
+
+    # /tmp is the default storage location; --db_path replaces it only when
+    # the given path exists.
+    config = {'db_path': '/tmp'}
+    if args.db_path:
+        if os.path.exists(args.db_path):
+            config['db_path'] = args.db_path
+        else:
+            # Make the fallback visible instead of silently ignoring the
+            # option.
+            logger.warning('db_path %s does not exist, using %s instead',
+                           args.db_path, config['db_path'])
+
+    base_url = 'http://radiofrance-podcast.net'
+    feeds = {'Ça peut pas faire de mal': 'podcast09/rss_11262.xml',
+             'Sur les épaules de Darwin': 'podcast09/rss_11549.xml',
+             'La preuve par Z': 'podcast09/rss_14430.xml'}
+
+    for emission, feed_path in feeds.items():
+        feed_url = '{}/{}'.format(base_url, feed_path)
+        parse_feed(config, emission, feed_url)