From 0584698995cc748434cddd4a1a3baa56ff7aa180 Mon Sep 17 00:00:00 2001
From: Matt Singleton
Date: Mon, 11 Sep 2017 23:27:50 -0400
Subject: news source for The Hill

---
 unbiased/sources/base.py    |  4 ++--
 unbiased/sources/thehill.py | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 2 deletions(-)
 create mode 100644 unbiased/sources/thehill.py

diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py
index 68e7f0d..b4bff75 100644
--- a/unbiased/sources/base.py
+++ b/unbiased/sources/base.py
@@ -174,8 +174,8 @@ class NewsSource(object):
 
     @classmethod
     def _get_author(cls, soup):
-        for author_tag in ['article:author', 'dc.creator', 'property="author']:
-            author = soup.find(author_tag)
+        for author_tag in ['article:author', 'dc.creator', 'author']:
+            author = soup.find('meta', property=author_tag)
             if author is None:
                 continue
             return author['content']
diff --git a/unbiased/sources/thehill.py b/unbiased/sources/thehill.py
new file mode 100644
index 0000000..c678261
--- /dev/null
+++ b/unbiased/sources/thehill.py
@@ -0,0 +1,41 @@
+import urllib
+
+from unbiased.sources.base import NewsSource
+
+class TheHill(NewsSource):
+
+    name = 'The Hill'
+    shortname = 'Hill'
+    url = 'http://thehill.com'
+
+    bad_titles = ['THE MEMO']
+    bad_authors = ['Matt Schlapp', 'Juan Williams', 'Judd Gregg']
+
+    @classmethod
+    def _fetch_urls(cls):
+        soup = cls._fetch_content(cls.url)
+
+        h1 = soup.find('h1', class_='top-story-headline')\
+                .find('a')['href']
+        h1 = urllib.parse.urljoin(cls.url, h1)
+        h1s = (h1,)
+
+        h23s = soup.find('div', class_='section-top-content')\
+                .find_all('div', class_='top-story-item')
+        h2s = set([x.h4.a['href'] for x in h23s if 'small' not in x['class']])
+        h2s = tuple(urllib.parse.urljoin(cls.url, x) for x in h2s)
+
+        h3s = set([x.h4.a['href'] for x in h23s if 'small' in x['class']])
+        h3s = tuple(urllib.parse.urljoin(cls.url, x) for x in h3s)
+
+        return h1s, h2s, h3s
+
+    @classmethod
+    def _get_description(cls, soup):
+        try:
+            return NewsSource._get_description(soup)
+        except Exception:
+            # fall back on grabbing text from the article
+            desc = soup.find('div', class_='field-items')
+            return desc.text[:200].rsplit(' ', 1)[0]
+
-- 
cgit v1.2.3