diff options
author | Matt Singleton <msingleton@aclu.org> | 2017-10-14 17:46:18 -0400 |
---|---|---|
committer | Matt Singleton <msingleton@aclu.org> | 2017-10-14 17:46:18 -0400 |
commit | ff01ea02a0cd85d7199455de1a053b57fdc27eee (patch) | |
tree | 62269832d76eff5f75229f0cb467e09bed2976c2 | |
parent | beb04a9bb4935068926e167a38a3fdf9ec37c049 (diff) |
CBS News closes #5
-rw-r--r-- | unbiased/sources/cbs.py | 37 |
1 files changed, 37 insertions, 0 deletions
# unbiased/sources/cbs.py
from unbiased.sources.base import NewsSource


class CBS(NewsSource):
    """Collect front-page story links from CBS News.

    ``_fetch_urls`` reads the home page and sorts the links it finds into
    primary (h1), secondary (h2) and tertiary (h3) headline groups.
    """

    name = 'CBS News'
    shortname = 'cbs'
    url = 'https://www.cbsnews.com/'

    # NOTE(review): presumably consumed by NewsSource to drop unwanted
    # stories by title / description / URL substring -- confirm in base.py.
    bad_titles = ['60 Minutes']
    bad_descriptions = ['60 Minutes']
    bad_urls = ['whats-in-the-news-coverart']

    @staticmethod
    def _anchor_href(tag):
        """Return the href of the first ``<a>`` inside *tag*, or ``None``.

        ``None`` is returned when *tag* itself is ``None``, when it holds no
        anchor, or when the anchor carries no ``href`` attribute.
        """
        # assumes tag is a BeautifulSoup Tag (``.a`` / ``.get``) -- TODO confirm
        anchor = tag.a if tag is not None else None
        if anchor is None:
            return None
        return anchor.get('href')

    @classmethod
    def _region_links(cls, soup, region, limit=None):
        """Return the story links listed in one ``data-tb-region`` block.

        Args:
            soup: parsed home page, as returned by ``cls._fetch_content``.
            region: value of the ``data-tb-region`` attribute to look up.
            limit: optional cap on the number of links returned.

        Returns:
            A tuple of href strings, in page order. List items that have no
            usable link are skipped instead of aborting the whole scrape.

        Raises:
            AttributeError: if the region (or its ``<ul>``) is not on the
                page, exactly as before -- a layout change should stay loud.
        """
        items = soup.find('div', attrs={'data-tb-region': region})\
            .ul.find_all('li', attrs={'data-tb-region-item': True})
        links = tuple(
            href for href in (cls._anchor_href(item) for item in items)
            if href
        )
        return links if limit is None else links[:limit]

    @classmethod
    def _fetch_urls(cls):
        """Scrape the CBS News front page for headline URLs.

        Returns:
            A ``(h1s, h2s, h3s)`` triple of tuples of href strings: the
            primary headline, the secondary headlines, and up to five
            tertiary headlines.
        """
        soup = cls._fetch_content(cls.url)

        # get primary headline
        lead = cls._anchor_href(soup.find('h1', class_='title'))

        # get secondary headlines
        h2s = cls._region_links(soup, 'Big News Area Side Assets')

        if lead:
            h1s = (lead,)
        else:
            # sometimes they lead with a video (or the h1 has no link);
            # promote the first secondary headline into the h1 slot.
            # Slicing keeps this safe when there are no secondary links.
            h1s, h2s = h2s[:1], h2s[1:]

        # get tertiary headlines
        h3s = cls._region_links(soup, 'Hard News', limit=5)

        return h1s, h2s, h3s