summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Matt Singleton <matt@xcolour.net> 2017-10-22 21:29:26 -0400
committer Matt Singleton <matt@xcolour.net> 2017-10-22 21:29:26 -0400
commit f9ef3b242558dca8e2ad6a5592eee13be4d592d1 (patch)
tree 94e614705e1951ab41d2de7d0752294ac1c1539e
parent f9b7022db9f66a7c72fb19a5c74ef9a25c04a6f0 (diff)
yank hi-res guardian image urls out of a piece of js, closes #14
-rw-r--r-- unbiased/sources/guardian.py 18
1 files changed, 18 insertions, 0 deletions
diff --git a/unbiased/sources/guardian.py b/unbiased/sources/guardian.py
index 5a1c3dd..044c227 100644
--- a/unbiased/sources/guardian.py
+++ b/unbiased/sources/guardian.py
@@ -1,4 +1,5 @@
import html
+import re
from unbiased.sources.base import NewsSource
@@ -11,6 +12,8 @@ class TheGuardian(NewsSource):
bad_authors = ['Tom McCarthy', 'Andy Hunter']
bad_urls = ['https://www.theguardian.com/profile/ben-jacobs']
+ _img_pat = re.compile('"srcsets":"(.*?)"')
+
@classmethod
def _fetch_urls(cls):
soup = cls._fetch_content(cls.url)
@@ -28,6 +31,21 @@ class TheGuardian(NewsSource):
@classmethod
def _get_image(cls, soup):
+ # the guardian watermarks the images in their <meta> tags,
+ # and the <img> of the hero is a very small resolution,
+ # but we can pull a hi-res image url out of the <script>
+ # body inside of the page.
+ try:
+ script = soup.find('script', id='gu').text
+ matches = cls._img_pat.search(script)
+ if matches:
+ srcsets = matches.group(1).split(',')
+ srcsets = sorted([(int(y.strip('w')), x.strip()) for x, y in [x.rsplit(' ', 1) for x in srcsets]])
+ return html.unescape(srcsets[-1][1])
+ except Exception:
+ pass
+
+ # if that ugly, brittle shit fails, fall back on the low-res image
if soup.find('img', class_='maxed'):
img = soup.find('img', class_='maxed')['src']
if soup.find('meta', itemprop='image'):