diff options
author | Matt Singleton <matt@xcolour.net> | 2017-09-11 23:27:50 -0400 |
---|---|---|
committer | Matt Singleton <matt@xcolour.net> | 2017-09-11 23:27:50 -0400 |
commit | 0584698995cc748434cddd4a1a3baa56ff7aa180 (patch) | |
tree | 9afabc73f8269880fc9740fac00733fb94520732 | |
parent | e53b324c148e81f4e4dff009670639825f2a2006 (diff) |
news source for The Hill
-rw-r--r-- | unbiased/sources/base.py | 4 | ++-- |
-rw-r--r-- | unbiased/sources/thehill.py | 41 | +++++++++++++++++++++++++++++++++++++++++ |
2 files changed, 43 insertions, 2 deletions
```diff
diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py
index 68e7f0d..b4bff75 100644
--- a/unbiased/sources/base.py
+++ b/unbiased/sources/base.py
@@ -174,8 +174,8 @@ class NewsSource(object):
 
     @classmethod
     def _get_author(cls, soup):
-        for author_tag in ['article:author', 'dc.creator', 'property="author']:
-            author = soup.find(author_tag)
+        for author_tag in ['article:author', 'dc.creator', 'author']:
+            author = soup.find('meta', property=author_tag)
             if author is None:
                 continue
             return author['content']
diff --git a/unbiased/sources/thehill.py b/unbiased/sources/thehill.py
new file mode 100644
index 0000000..c678261
--- /dev/null
+++ b/unbiased/sources/thehill.py
@@ -0,0 +1,41 @@
+import urllib
+
+from unbiased.sources.base import NewsSource
+
+class TheHill(NewsSource):
+
+    name = 'The Hill'
+    shortname = 'Hill'
+    url = 'http://thehill.com'
+
+    bad_titles = ['THE MEMO']
+    bad_authors = ['Matt Schlapp', 'Juan Williams', 'Judd Gregg']
+
+    @classmethod
+    def _fetch_urls(cls):
+        soup = cls._fetch_content(cls.url)
+
+        h1 = soup.find('h1', class_='top-story-headline')\
+            .find('a')['href']
+        h1 = urllib.parse.urljoin(cls.url, h1)
+        h1s = (h1,)
+
+        h23s = soup.find('div', class_='section-top-content')\
+            .find_all('div', class_='top-story-item')
+        h2s = set([x.h4.a['href'] for x in h23s if 'small' not in x['class']])
+        h2s = tuple(urllib.parse.urljoin(cls.url, x) for x in h2s)
+
+        h3s = set([x.h4.a['href'] for x in h23s if 'small' in x['class']])
+        h3s = tuple(urllib.parse.urljoin(cls.url, x) for x in h3s)
+
+        return h1s, h2s, h3s
+
+    @classmethod
+    def _get_description(cls, soup):
+        try:
+            return NewsSource._get_description(soup)
+        except Exception:
+            # fall back on grabbing text from the article
+            desc = soup.find('div', class_='field-items')
+            return desc.text[:200].rsplit(' ', 1)[0]
+
```