news source for The Hill

author: Matt Singleton <matt@xcolour.net> 2017-09-11 23:27:50 -0400
committer: Matt Singleton <matt@xcolour.net> 2017-09-11 23:27:50 -0400
commit: 0584698995cc748434cddd4a1a3baa56ff7aa180 (patch)
tree: 9afabc73f8269880fc9740fac00733fb94520732
parent: e53b324c148e81f4e4dff009670639825f2a2006 (diff)
2 files changed, 43 insertions, 2 deletions
diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py
index 68e7f0d..b4bff75 100644
--- a/unbiased/sources/base.py
+++ b/unbiased/sources/base.py
@@ -174,8 +174,8 @@ class NewsSource(object):
 
     @classmethod
     def _get_author(cls, soup):
-        for author_tag in ['article:author', 'dc.creator', 'property="author']:
-            author = soup.find(author_tag)
+        for author_tag in ['article:author', 'dc.creator', 'author']:
+            author = soup.find('meta', property=author_tag)
             if author is None:
                 continue
             return author['content']
diff --git a/unbiased/sources/thehill.py b/unbiased/sources/thehill.py
new file mode 100644
index 0000000..c678261
--- /dev/null
+++ b/unbiased/sources/thehill.py
@@ -0,0 +1,41 @@
+import urllib
+
+from unbiased.sources.base import NewsSource
+
+class TheHill(NewsSource):
+
+    name = 'The Hill'
+    shortname = 'Hill'
+    url = 'http://thehill.com'
+
+    bad_titles = ['THE MEMO']
+    bad_authors = ['Matt Schlapp', 'Juan Williams', 'Judd Gregg']
+
+    @classmethod
+    def _fetch_urls(cls):
+        soup = cls._fetch_content(cls.url)
+
+        h1 = soup.find('h1', class_='top-story-headline')\
+            .find('a')['href']
+        h1 = urllib.parse.urljoin(cls.url, h1)
+        h1s = (h1,)
+
+        h23s = soup.find('div', class_='section-top-content')\
+                  .find_all('div', class_='top-story-item')
+        h2s = set([x.h4.a['href'] for x in h23s if 'small' not in x['class']])
+        h2s = tuple(urllib.parse.urljoin(cls.url, x) for x in h2s)
+
+        h3s = set([x.h4.a['href'] for x in h23s if 'small' in x['class']])
+        h3s = tuple(urllib.parse.urljoin(cls.url, x) for x in h3s)
+
+        return h1s, h2s, h3s
+
+    @classmethod
+    def _get_description(cls, soup):
+        try:
+            return NewsSource._get_description(soup)
+        except Exception:
+            # fall back on grabbing text from the article
+            desc = soup.find('div', class_='field-items')
+            return desc.text[:200].rsplit(' ', 1)[0]
+
author	Matt Singleton <matt@xcolour.net>	2017-09-11 23:27:50 -0400
committer	Matt Singleton <matt@xcolour.net>	2017-09-11 23:27:50 -0400
commit	0584698995cc748434cddd4a1a3baa56ff7aa180 (patch)
tree	9afabc73f8269880fc9740fac00733fb94520732
parent	e53b324c148e81f4e4dff009670639825f2a2006 (diff)