diff options
author | Matt Singleton <matt@xcolour.net> | 2017-10-22 21:29:26 -0400 |
---|---|---|
committer | Matt Singleton <matt@xcolour.net> | 2017-10-22 21:29:26 -0400 |
commit | f9ef3b242558dca8e2ad6a5592eee13be4d592d1 (patch) | |
tree | 94e614705e1951ab41d2de7d0752294ac1c1539e | |
parent | f9b7022db9f66a7c72fb19a5c74ef9a25c04a6f0 (diff) |
yank hi-res guardian image urls out of a piece of js, closes #14
-rw-r--r-- | unbiased/sources/guardian.py | 18 |
1 files changed, 18 insertions, 0 deletions
```diff
diff --git a/unbiased/sources/guardian.py b/unbiased/sources/guardian.py
index 5a1c3dd..044c227 100644
--- a/unbiased/sources/guardian.py
+++ b/unbiased/sources/guardian.py
@@ -1,4 +1,5 @@
 import html
+import re
 
 from unbiased.sources.base import NewsSource
 
@@ -11,6 +12,8 @@ class TheGuardian(NewsSource):
     bad_authors = ['Tom McCarthy', 'Andy Hunter']
     bad_urls = ['https://www.theguardian.com/profile/ben-jacobs']
 
+    _img_pat = re.compile('"srcsets":"(.*?)"')
+
     @classmethod
     def _fetch_urls(cls):
         soup = cls._fetch_content(cls.url)
@@ -28,6 +31,21 @@ class TheGuardian(NewsSource):
 
     @classmethod
     def _get_image(cls, soup):
+        # the guardian watermarks the images in their <meta> tags,
+        # and the <img> of the hero is a very small resolution,
+        # but we can pull a hi-res image url out of the <script>
+        # body inside of the page.
+        try:
+            script = soup.find('script', id='gu').text
+            matches = cls._img_pat.search(script)
+            if matches:
+                srcsets = matches.group(1).split(',')
+                srcsets = sorted([(int(y.strip('w')), x.strip()) for x, y in [x.rsplit(' ', 1) for x in srcsets]])
+                return html.unescape(srcsets[-1][1])
+        except Exception:
+            pass
+
+        # if that ugly, brittle shit fails, fall back on the low-res image
         if soup.find('img', class_='maxed'):
             img = soup.find('img', class_='maxed')['src']
         if soup.find('meta', itemprop='image'):
```