author | Matt Singleton <matt@xcolour.net> | 2017-09-12 22:53:23 -0400 |
---|---|---|
committer | Matt Singleton <matt@xcolour.net> | 2017-09-12 22:53:23 -0400 |
commit | 9b5f9b4f1be2563ebb639f90a943649d0165b7b8 | |
tree | 0681b6ca4ed538d85208be18b2fafa8e7bd4012b | |
parent | 22473179b0677ad50fd0d3284726683cf00c54e0 | |
new source The Guardian
-rw-r--r-- | unbiased/sources/guardian.py | 38 |
1 file changed, 38 insertions, 0 deletions
diff --git a/unbiased/sources/guardian.py b/unbiased/sources/guardian.py
new file mode 100644
index 0000000..dff098b
--- /dev/null
+++ b/unbiased/sources/guardian.py
@@ -0,0 +1,38 @@
+import urllib
+import html
+
+from unbiased.sources.base import NewsSource
+
+class TheGuardian(NewsSource):
+
+    name = 'The Guardian'
+    shortname = 'Guardian'
+    url = 'https://www.theguardian.com/us'
+
+    bad_authors = ['Tom McCarthy', 'Andy Hunter']
+    bad_urls = ['https://www.theguardian.com/profile/ben-jacobs']
+
+    @classmethod
+    def _fetch_urls(cls):
+        soup = cls._fetch_content(cls.url)
+
+        url_groups = []
+        for htag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+            hblocks = soup.find('section', id='headlines').find_all(htag)
+            urls = [x.a['href'] for x in hblocks]
+            url_groups.append(urls)
+        url_groups = [x for x in url_groups if len(x) > 0]
+        if len(url_groups) < 3:
+            raise Exception('not enough article groups on Guardian home page!')
+
+        return tuple(url_groups[0]), tuple(url_groups[1]), tuple(url_groups[2])
+
+    @classmethod
+    def _get_image(cls, soup):
+        if soup.find('img', class_='maxed'):
+            img = soup.find('img', class_='maxed')['src']
+        if soup.find('meta', itemprop='image'):
+            img = soup.find('meta', itemprop='image')['content']
+        if soup.find('img', class_='immersive-main-media__media'):
+            img = soup.find('img', class_='immersive-main-media__media')['src']
+        return html.unescape(img)
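To make the fallback order in `_get_image` concrete, here is a minimal sketch against synthetic markup. The HTML snippet and the example.com URLs are invented for illustration; only `TheGuardian._get_image` (from this diff) and BeautifulSoup are real:

```python
# Illustration only: synthetic HTML, not a real Guardian page.
# Shows that _get_image checks selectors in order, so a later match
# (meta[itemprop=image]) overwrites an earlier one (img.maxed).
from bs4 import BeautifulSoup

from unbiased.sources.guardian import TheGuardian

snippet = """
<img class="maxed" src="https://example.com/small.jpg">
<meta itemprop="image" content="https://example.com/lead.jpg">
"""
soup = BeautifulSoup(snippet, 'html.parser')

print(TheGuardian._get_image(soup))  # https://example.com/lead.jpg
```

Note that if none of the three selectors matches, `img` is never assigned and the method raises `UnboundLocalError`, so the scraper appears to assume every article page carries at least one of these image tags.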