author     Sylvain Herlédan <sylvain.herledan@hrafnagud.info>  2017-06-07 01:19:57 +0200
committer  Sylvain Herlédan <sylvain.herledan@hrafnagud.info>  2017-06-07 01:19:57 +0200
commit     680280d7fdadd5a9f149506e1fb1db1eea562444 (patch)
tree       e0dfde57220d263abdb1ef05afb5645fe3026035
download   crawlers-680280d7fdadd5a9f149506e1fb1db1eea562444.tar.gz
           crawlers-680280d7fdadd5a9f149506e1fb1db1eea562444.tar.bz2
           crawlers-680280d7fdadd5a9f149506e1fb1db1eea562444.zip
Initial commit with France Inter crawling scripts.
-rw-r--r--  crawlers/db.py                 29
-rw-r--r--  crawlers/france_inter/rss.py   78
-rw-r--r--  crawlers/france_inter/www.py  139
-rwxr-xr-x  setup.sh                       33

4 files changed, 279 insertions, 0 deletions
diff --git a/crawlers/db.py b/crawlers/db.py
new file mode 100644
index 0000000..1b70d84
--- /dev/null
+++ b/crawlers/db.py
@@ -0,0 +1,29 @@
+# -*- encoding: utf-8 -*-
+
+"""
+@author: <sylvain.herledan@hrafnagud.info>
+@date: 2017-06-07
+"""
+
+import os
+import json
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def has_hash(config, key):
+ """"""
+ db_path = config.get('db_path', '/tmp')
+ filename = '{}.dlz'.format(key)
+ filepath = os.path.join(db_path, filename)
+ return os.path.exists(filepath)
+
+
+def store_hash(config, item):
+ """"""
+ db_path = config.get('db_path', '/tmp')
+ filename = '{}.dlz'.format(item['hash'])
+ filepath = os.path.join(db_path, filename)
+ with open(filepath, 'w') as f:
+ json.dump(item, f, indent=2, ensure_ascii=False)
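The module above is a flat-file store: each item becomes a <hash>.dlz JSON
file under db_path, and the existence of that file marks the item as already
processed. A minimal round-trip sketch, assuming the crawlers package is
importable and using a hypothetical item:

    import tempfile
    from crawlers import db

    config = {'db_path': tempfile.mkdtemp()}
    item = {'title': 'Example', 'source': 'France Inter: Example',
            'url': 'http://example.invalid/episode', 'hash': 12345,
            'datetime': '2017-06-07T00:00:00'}

    assert not db.has_hash(config, item['hash'])
    db.store_hash(config, item)
    assert db.has_hash(config, item['hash'])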
diff --git a/crawlers/france_inter/rss.py b/crawlers/france_inter/rss.py
new file mode 100644
index 0000000..48509d0
--- /dev/null
+++ b/crawlers/france_inter/rss.py
@@ -0,0 +1,78 @@
+# -*- encoding: utf-8 -*-
+
+"""
+@author: <sylvain.herledan@hrafnagud.info>
+@date: 2017-06-07
+"""
+
+import os
+import logging
+import argparse
+import smhasher
+import feedparser
+from crawlers import db
+
+logger = logging.getLogger()
+handler = logging.StreamHandler()
+handler.setLevel(logging.DEBUG)
+logger.addHandler(handler)
+logger.setLevel(logging.WARN)
+
+
+def parse_args():
+ """"""
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--debug', action='store_true', default=False)
+ parser.add_argument('--db_path', type=str, required=False)
+
+ args = parser.parse_args()
+ return args
+
+
+def parse_feed(config, emission, feed_url):
+ """"""
+ r = feedparser.parse(feed_url)
+
+ for entry in r.entries:
+ e_url = entry.id
+ e_hash = smhasher.murmur3_x64_128(e_url)
+
+ # Check database to avoid reprocessing
+ already_processed = db.has_hash(config, e_hash)
+ if already_processed:
+ continue
+
+ e_source = 'France Inter: {}'.format(emission)
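+        # Entry ids carry the broadcast date as their second '-'-separated
+        # field, in DD.MM.YYYY form; reorder it into an ISO timestamp.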
+ basename = os.path.basename(entry.id)
+ _, d, _ = basename.split('-', 2)
+ date_elems = d.split('.')
+ e_datetime = '{}-{}-{}T00:00:00'.format(date_elems[2],
+ date_elems[1],
+ date_elems[0])
+
+ # Save in database
+ e = {'title': entry.title,
+ 'source': e_source,
+ 'url': e_url,
+ 'hash': e_hash,
+ 'datetime': e_datetime}
+ db.store_hash(config, e)
+
+
+if '__main__' == __name__:
+ args = parse_args()
+ if args.debug is True:
+ logger.setLevel(logging.DEBUG)
+
+ config = {'db_path': '/tmp'}
+ if args.db_path and os.path.exists(args.db_path):
+ config['db_path'] = args.db_path
+
+ base_url = 'http://radiofrance-podcast.net'
+ feeds = {'Ça peut pas faire de mal': 'podcast09/rss_11262.xml',
+ 'Sur les épaules de Darwin': 'podcast09/rss_11549.xml',
+ 'La preuve par Z': 'podcast09/rss_14430.xml'}
+
+ for emission in feeds:
+ feed_url = '{}/{}'.format(base_url, feeds[emission])
+ parse_feed(config, emission, feed_url)
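For reference, the date handling in parse_feed reduces to the reordering
below; the entry id is hypothetical, shaped like the ids this feed serves:

    import os

    entry_id = ('http://media.radiofrance-podcast.net/podcast09/'
                '11262-07.06.2017-ITEMA_21343349-0.mp3')

    basename = os.path.basename(entry_id)
    _, d, _ = basename.split('-', 2)     # d == '07.06.2017'
    day, month, year = d.split('.')
    print('{}-{}-{}T00:00:00'.format(year, month, day))
    # 2017-06-07T00:00:00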
diff --git a/crawlers/france_inter/www.py b/crawlers/france_inter/www.py
new file mode 100644
index 0000000..4a75e9a
--- /dev/null
+++ b/crawlers/france_inter/www.py
@@ -0,0 +1,139 @@
+# -*- encoding: utf-8 -*-
+
+"""
+@author: <sylvain.herledan@hrafnagud.info>
+@date: 2017-06-06
+"""
+
+import os
+import sys
+import logging
+import argparse
+import smhasher
+import requests
+from html.parser import HTMLParser
+from crawlers import db
+
+logger = logging.getLogger()
+handler = logging.StreamHandler()
+handler.setLevel(logging.DEBUG)
+logger.addHandler(handler)
+logger.setLevel(logging.WARN)
+
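+# French month names as they appear, unaccented, in diffusion paths,
+# mapped to zero-padded month numbers.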
+MNUMS = {'janvier': '01', 'fevrier': '02', 'mars': '03', 'avril': '04',
+ 'mai': '05', 'juin': '06', 'juillet': '07', 'aout': '08',
+ 'septembre': '09', 'octobre': '10', 'novembre': '11',
+ 'decembre': '12'}
+
+
+class FranceInterPageParser(HTMLParser):
+ def __init__(self, emission, *args, **kwargs):
+ """"""
+ self.entries = []
+ self.emission = emission
+ self.draft = None
+ super(FranceInterPageParser, self).__init__(*args, **kwargs)
+
+ def handle_starttag(self, tag, attributes):
+ """"""
+ if 'button' == tag:
+ e_url = None
+ e_title = None
+ e_datetime = None
+            for name, value in attributes:
+                if 'data-url' == name:
+                    e_url = value
+                elif 'data-diffusion-title' == name:
+                    e_title = value
+                elif 'data-diffusion-path' == name:
+                    # Scan the path elements from the right for a
+                    # plausible year preceded by a month name and a day.
+                    year = month = day = None
+                    elems = value.rsplit('-')
+                    for offset in range(len(elems) - 2):
+                        year_idx = -1 * (1 + offset)
+                        if elems[year_idx].isdigit():
+                            year = int(elems[year_idx])
+                            if year in range(2000, 2050):
+                                month = elems[year_idx - 1]
+                                day = elems[year_idx - 2]
+                                break
+                            year = None
+
+                    if year is not None and month in MNUMS:
+                        e_datetime = '{}-{}-{}T00:00:00'.format(year,
+                                                                MNUMS[month],
+                                                                day)
+
+ if None in (e_url, e_title, e_datetime):
+ return
+
+ e_source = 'France Inter: {}'.format(self.emission)
+ e_hash = smhasher.murmur3_x64_128(e_url)
+
+ self.draft = {'title': e_title,
+ 'source': e_source,
+ 'url': e_url,
+ 'hash': e_hash,
+ 'datetime': e_datetime}
+
+ def handle_endtag(self, tag):
+ """"""
+ if 'button' == tag and self.draft is not None:
+ self.entries.append(self.draft)
+ self.draft = None
+
+
+def parse_args():
+ """"""
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--debug', action='store_true', default=False)
+ parser.add_argument('--db_path', type=str, required=False)
+
+ args = parser.parse_args()
+ return args
+
+
+if '__main__' == __name__:
+ """"""
+ args = parse_args()
+ if args.debug is True:
+ logger.setLevel(logging.DEBUG)
+
+ config = {'db_path': '/tmp'}
+ if args.db_path and os.path.exists(args.db_path):
+ config['db_path'] = args.db_path
+
+ base_url = 'https://www.franceinter.fr'
+ urls = {'Ça peut pas faire de mal': 'emissions/ca-peut-pas-faire-de-mal',
+ 'Sur les épaules de Darwin': 'emissions/sur-les-epaules-de-darwin',
+ 'La preuve par Z': 'emissions/la-preuve-par-z'}
+
+ for emission in urls:
+ emission_url = '{}/{}'.format(base_url, urls[emission])
+ page_number = 1
+
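+        # Walk ?p=1, ?p=2, ... and stop after the first page that
+        # yields no entries (page_number is then set to -1).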
+ while 0 < page_number:
+ page_url = '{}?p={}'.format(emission_url, page_number)
+ r = requests.get(page_url)
+ if not r.ok:
+ logger.error(r.text)
+ sys.exit(1)
+ page_content = r.text
+
+ parser = FranceInterPageParser(emission)
+ parser.feed(page_content)
+ parser.close()
+
+ if 0 < len(parser.entries):
+ # Save results
+ for entry in parser.entries:
+ already_processed = db.has_hash(config, entry['hash'])
+ if already_processed:
+ continue
+
+ db.store_hash(config, entry)
+
+ # Process next page
+ page_number = page_number + 1
+ else:
+ page_number = -1
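The parser keys off the data-* attributes of <button> tags; the sketch below
isolates that mechanism, fed hypothetical markup shaped like the attributes
www.py inspects:

    from html.parser import HTMLParser

    class ButtonAttrParser(HTMLParser):
        """Collect the data-* attributes of every <button> tag."""

        def __init__(self):
            super().__init__()
            self.buttons = []

        def handle_starttag(self, tag, attributes):
            if 'button' == tag:
                self.buttons.append(dict(attributes))

    # Hypothetical markup, for illustration only.
    html = ('<button data-url="http://example.invalid/episode.mp3"'
            ' data-diffusion-title="Une émission"'
            ' data-diffusion-path="emissions/exemple/exemple-7-juin-2017">'
            'Écouter</button>')

    parser = ButtonAttrParser()
    parser.feed(html)
    parser.close()
    print(parser.buttons[0]['data-diffusion-path'])
    # emissions/exemple/exemple-7-juin-2017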
diff --git a/setup.sh b/setup.sh
new file mode 100755
index 0000000..cf1d366
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# bash is required: 'set -o pipefail' below is not available in POSIX sh.
+
+# @author: <sylvain.herledan@hrafnagud.info>
+# @date: 2017-06-06
+
+set -o nounset
+set -o errexit
+set -o pipefail
+
+readonly PROGNAME="$(basename "${0}")";
+readonly PROGDIR="$(readlink -f "$(dirname "${0}")")";
+readonly ARGS="${@}";
+
+install_virtualenv()
+{
+ local install_dir="${1}"; shift;
+ local install_script_path="$(mktemp)";
+ local env_dir="${install_dir}/env";
+
+ wget 'https://bootstrap.pypa.io/get-pip.py' -O "${install_script_path}"
+ mkdir -p "${env_dir}"
+
+ PYTHONUSERBASE="${env_dir}" python "${install_script_path}" --user
+ rm -f "${install_script_path}"
+
+ PYTHONUSERBASE="${env_dir}" "${env_dir}/bin/pip" install --user virtualenv
+
+ local python_pkg_dir="$(find "${env_dir}/lib" -type d -name "*-packages" | head -1)";
+ PYTHONPATH="${python_pkg_dir}" env/bin/virtualenv env
+}
+
+install_virtualenv "${1:?usage: ${PROGNAME} INSTALL_DIR}"
+exit 0