From f9ef3b242558dca8e2ad6a5592eee13be4d592d1 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Sun, 22 Oct 2017 21:29:26 -0400 Subject: yank hi-res guardian image urls out of a piece of js, closes #14 --- unbiased/sources/guardian.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/unbiased/sources/guardian.py b/unbiased/sources/guardian.py index 5a1c3dd..044c227 100644 --- a/unbiased/sources/guardian.py +++ b/unbiased/sources/guardian.py @@ -1,4 +1,5 @@ import html +import re from unbiased.sources.base import NewsSource @@ -11,6 +12,8 @@ class TheGuardian(NewsSource): bad_authors = ['Tom McCarthy', 'Andy Hunter'] bad_urls = ['https://www.theguardian.com/profile/ben-jacobs'] + _img_pat = re.compile('"srcsets":"(.*?)"') + @classmethod def _fetch_urls(cls): soup = cls._fetch_content(cls.url) @@ -28,6 +31,21 @@ class TheGuardian(NewsSource): @classmethod def _get_image(cls, soup): + # the guardian watermarks the images in their <meta> tags, + # and the <img> of the hero is a very small resolution, + # but we can pull a hi-res image url out of the