# -*- coding: utf-8 -*-
"""
Crawl France Inter podcast RSS feeds and record new entries in a database.

@author: <sylvain.herledan@hrafnagud.info>
@date: 2017-06-07
"""
import os
import logging
import argparse

import smhasher
import feedparser

from crawlers import db
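
# NOTE: 'crawlers.db' is a project-local module. From its usage below it is
# assumed to expose has_hash(config, hash) -> bool, to test whether an entry
# was already stored, and store_hash(config, entry), to persist a new one.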

# Log to stderr; the effective level is the logger's (WARNING by default,
# raised to DEBUG by --debug), since the handler lets everything through.
logger = logging.getLogger()
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
logger.addHandler(handler)
logger.setLevel(logging.WARNING)

def parse_args():
    """Parse the command-line arguments."""
    parser = argparse.ArgumentParser(description='Crawl France Inter podcast feeds.')
    parser.add_argument('--debug', action='store_true',
                        help='enable debug logging')
    parser.add_argument('--db_path', type=str,
                        help='directory where the crawler database lives')
    return parser.parse_args()

def parse_feed(config, emission, feed_url):
    """Fetch an RSS feed and store any entry that has not been seen yet."""
    r = feedparser.parse(feed_url)
    for entry in r.entries:
        e_url = entry.id
        e_hash = smhasher.murmur3_x64_128(e_url)
        # Skip entries whose hash is already in the database
        if db.has_hash(config, e_hash):
            continue
        e_source = 'France Inter: {}'.format(emission)
        # The basename of the entry URL is expected to look like
        # 'NNNNN-DD.MM.YYYY-...': keep the middle field and rewrite the
        # French-style date as an ISO 8601 datetime
        basename = os.path.basename(e_url)
        _, d, _ = basename.split('-', 2)
        day, month, year = d.split('.')
        e_datetime = '{}-{}-{}T00:00:00'.format(year, month, day)
        # Save the new entry in the database
        e = {'title': entry.title,
             'source': e_source,
             'url': e_url,
             'hash': e_hash,
             'datetime': e_datetime}
        db.store_hash(config, e)
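
# Worked example with a hypothetical entry.id, matching the split above:
#   basename '14430-07.06.2017-ITEMA_21345678-0.mp3'
#   -> ('14430', '07.06.2017', 'ITEMA_21345678-0.mp3') after split('-', 2)
#   -> e_datetime '2017-06-07T00:00:00'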

if __name__ == '__main__':
    args = parse_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    # Default to /tmp unless an existing --db_path directory was given
    config = {'db_path': '/tmp'}
    if args.db_path and os.path.exists(args.db_path):
        config['db_path'] = args.db_path
    base_url = 'http://radiofrance-podcast.net'
    feeds = {'Ça peut pas faire de mal': 'podcast09/rss_11262.xml',
             'Sur les épaules de Darwin': 'podcast09/rss_11549.xml',
             'La preuve par Z': 'podcast09/rss_14430.xml'}
    for emission, path in feeds.items():
        feed_url = '{}/{}'.format(base_url, path)
        parse_feed(config, emission, feed_url)
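
# Example invocation (assuming this file is saved as crawl_franceinter.py):
#   python crawl_franceinter.py --db_path /var/lib/crawlers --debug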