path: root/crawlers/france_inter/rss.py
blob: 48509d002d6bbbecd24c6896fd8632f83eba58fb
# -*- coding: utf-8 -*-

"""
@author: <sylvain.herledan@hrafnagud.info>
@date: 2017-06-07
"""

import os
import logging
import argparse
import smhasher
import feedparser
from crawlers import db

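# Root logger setup: the handler lets everything through, so the logger level
# (WARN by default, DEBUG with --debug) decides what is actually emitted.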
logger = logging.getLogger()
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
logger.addHandler(handler)
logger.setLevel(logging.WARN)


def parse_args():
    """"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--debug', action='store_true', default=False)
    parser.add_argument('--db_path', type=str, required=False)

    args = parser.parse_args()
    return args


def parse_feed(config, emission, feed_url):
    """"""
    r = feedparser.parse(feed_url)

    for entry in r.entries:
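        # The entry id (a URL) hashed with 128-bit MurmurHash3 is used as the
        # deduplication key.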
        e_url = entry.id
        e_hash = smhasher.murmur3_x64_128(e_url)

        # Check database to avoid reprocessing
        already_processed = db.has_hash(config, e_hash)
        if already_processed:
            continue

        e_source = 'France Inter: {}'.format(emission)
        basename = os.path.basename(entry.id)
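        # The basename carries a DD.MM.YYYY date as its second '-'-separated
        # field; rebuild it as an ISO 8601 timestamp at midnight.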
        _, d, _ = basename.split('-', 2)
        date_elems = d.split('.')
        e_datetime = '{}-{}-{}T00:00:00'.format(date_elems[2],
                                                date_elems[1],
                                                date_elems[0])

        # Save in database
        e = {'title': entry.title,
             'source': e_source,
             'url': e_url,
             'hash': e_hash,
             'datetime': e_datetime}
        db.store_hash(config, e)


if __name__ == '__main__':
    args = parse_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)

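    # Default database location; --db_path overrides it when the directory exists.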
    config = {'db_path': '/tmp'}
    if args.db_path and os.path.exists(args.db_path):
        config['db_path'] = args.db_path

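    # France Inter shows to crawl, mapped to their Radio France RSS feed paths.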
    base_url = 'http://radiofrance-podcast.net'
    feeds = {'Ça peut pas faire de mal': 'podcast09/rss_11262.xml',
             'Sur les épaules de Darwin': 'podcast09/rss_11549.xml',
             'La preuve par Z': 'podcast09/rss_14430.xml'}

    for emission, path in feeds.items():
        feed_url = '{}/{}'.format(base_url, path)
        parse_feed(config, emission, feed_url)