yank hi-res guardian image urls out of a piece of js, closes #14

author: Matt Singleton <matt@xcolour.net> 2017-10-22 21:29:26 -0400
committer: Matt Singleton <matt@xcolour.net> 2017-10-22 21:29:26 -0400
commit: f9ef3b242558dca8e2ad6a5592eee13be4d592d1 (patch)
tree: 94e614705e1951ab41d2de7d0752294ac1c1539e
parent: f9b7022db9f66a7c72fb19a5c74ef9a25c04a6f0 (diff)
1 files changed, 18 insertions, 0 deletions
diff --git a/unbiased/sources/guardian.py b/unbiased/sources/guardian.py
index 5a1c3dd..044c227 100644
--- a/unbiased/sources/guardian.py
+++ b/unbiased/sources/guardian.py
@@ -1,4 +1,5 @@
 import html
+import re
 
 from unbiased.sources.base import NewsSource
 
@@ -11,6 +12,8 @@ class TheGuardian(NewsSource):
     bad_authors = ['Tom McCarthy', 'Andy Hunter']
     bad_urls = ['https://www.theguardian.com/profile/ben-jacobs']
 
+    _img_pat = re.compile('"srcsets":"(.*?)"')
+
     @classmethod
     def _fetch_urls(cls):
         soup = cls._fetch_content(cls.url)
@@ -28,6 +31,21 @@ class TheGuardian(NewsSource):
 
     @classmethod
     def _get_image(cls, soup):
+        # The Guardian watermarks the images in its <meta> tags,
+        # and the hero <img> is very low resolution,
+        # but we can pull a hi-res image URL out of the <script>
+        # body embedded in the page.
+        try:
+            script = soup.find('script', id='gu').text
+            matches = cls._img_pat.search(script)
+            if matches:
+                srcsets = matches.group(1).split(',')
+                srcsets = sorted([(int(y.strip('w')), x.strip()) for x, y in [x.rsplit(' ', 1) for x in srcsets]])
+                return html.unescape(srcsets[-1][1])
+        except Exception:
+            pass
+
+        # if that brittle script-scraping fails, fall back on the low-res image
         if soup.find('img', class_='maxed'):
             img =  soup.find('img', class_='maxed')['src']
         if soup.find('meta', itemprop='image'):
author	Matt Singleton <matt@xcolour.net>	2017-10-22 21:29:26 -0400
committer	Matt Singleton <matt@xcolour.net>	2017-10-22 21:29:26 -0400
commit	f9ef3b242558dca8e2ad6a5592eee13be4d592d1 (patch)
tree	94e614705e1951ab41d2de7d0752294ac1c1539e
parent	f9b7022db9f66a7c72fb19a5c74ef9a25c04a6f0 (diff)