From 9b5f9b4f1be2563ebb639f90a943649d0165b7b8 Mon Sep 17 00:00:00 2001
From: Matt Singleton <matt@xcolour.net>
Date: Tue, 12 Sep 2017 22:53:23 -0400
Subject: new source The Guardian

---
 unbiased/sources/guardian.py | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 unbiased/sources/guardian.py

diff --git a/unbiased/sources/guardian.py b/unbiased/sources/guardian.py
new file mode 100644
index 0000000..dff098b
--- /dev/null
+++ b/unbiased/sources/guardian.py
@@ -0,0 +1,38 @@
+import urllib
+import html
+
+from unbiased.sources.base import NewsSource
+
+class TheGuardian(NewsSource):
+
+    name = 'The Guardian'
+    shortname = 'Guardian'
+    url = 'https://www.theguardian.com/us'
+
+    bad_authors = ['Tom McCarthy', 'Andy Hunter']
+    bad_urls = ['https://www.theguardian.com/profile/ben-jacobs']
+
+    @classmethod
+    def _fetch_urls(cls):
+        soup = cls._fetch_content(cls.url)
+
+        url_groups = []
+        for htag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+            hblocks = soup.find('section', id='headlines').find_all(htag)
+            urls = [x.a['href'] for x in hblocks]
+            url_groups.append(urls)
+        url_groups = [x for x in url_groups if len(url_groups) > 0]
+        if len(url_groups) < 3:
+            raise Exception('not enough article groups on Guardian home page!')
+
+        return tuple(url_groups[0]), tuple(url_groups[1]), tuple(url_groups[2])
+
+    @classmethod
+    def _get_image(cls, soup):
+        if soup.find('img', class_='maxed'):
+            img =  soup.find('img', class_='maxed')['src']
+        if soup.find('meta', itemprop='image'):
+            img = soup.find('meta', itemprop='image')['content']
+        if soup.find('img', class_='immersive-main-media__media'):
+            img = soup.find('img', class_='immersive-main-media__media')['src']
+        return html.unescape(img)
-- 
cgit v1.2.3