diff options
author | Matt Singleton <msingleton@aclu.org> | 2017-10-14 17:21:20 -0400 |
---|---|---|
committer | Matt Singleton <msingleton@aclu.org> | 2017-10-14 17:25:31 -0400 |
commit | beb04a9bb4935068926e167a38a3fdf9ec37c049 (patch) | |
tree | b12fc54de4fb065fc47c392b4d14daaecf500fa7 | |
parent | 753b48246a8e3eb5bfffa77814ff297287951e03 (diff) |
Christian Science Monitor closes #6
-rw-r--r-- | unbiased/sources/csm.py | 41 |
1 files changed, 41 insertions, 0 deletions
# Contents of unbiased/sources/csm.py (new file, 41 lines added by this commit).
from unbiased.sources.base import NewsSource


class CSM(NewsSource):
    """Headline scraper for the Christian Science Monitor USA section.

    Locates the primary, secondary and tertiary story links on the section
    front page by the ``block-<n>-<m>`` ids of the layout ``<div>`` elements.
    """

    name = 'Christian Science Monitor'
    shortname = 'csm'
    url = 'https://www.csmonitor.com/USA'

    # Filter lists; presumably consumed by NewsSource to reject unwanted
    # stories/images -- confirm against the base class.
    bad_titles = ['Change Agent']
    bad_imgs = ['csm_logo']
    bad_urls = ['difference-maker']

    @staticmethod
    def _story_urls(blocks):
        """Return the story hrefs found inside the given layout blocks.

        Every ``<a>`` nested in an ``<h3 class="story_headline">`` is
        considered, in document order. Links whose href contains
        ``'first-look'`` are dropped, and anchors that carry no ``href``
        attribute are skipped instead of raising ``KeyError``.

        :param blocks: iterable of parsed ``<div>`` tags to search.
        :returns: tuple of href strings.
        """
        urls = []
        for block in blocks:
            for headline in block.find_all('h3', class_='story_headline'):
                for anchor in headline.find_all('a'):
                    href = anchor.get('href')
                    if href is not None and 'first-look' not in href:
                        urls.append(href)
        return tuple(urls)

    @classmethod
    def _fetch_urls(cls):
        """Fetch the section page and return its headline URLs by tier.

        :returns: ``(h1s, h2s, h3s)`` -- three tuples of href strings for
            the primary, secondary and tertiary headlines respectively.
        :raises AttributeError, TypeError: if the primary headline block,
            its headline element or its link is missing from the page.
        """
        # NOTE(review): _fetch_content is inherited; it is assumed to return
        # a parsed BeautifulSoup document -- confirm against NewsSource.
        soup = cls._fetch_content(cls.url)

        # Primary headline: the single story in block-0-0. A missing element
        # is deliberately left to fail loudly, since it means the page
        # layout has changed.
        lead_block = soup.find('div', id='block-0-0')
        lead_headline = lead_block.find('h3', class_='story_headline')
        h1s = (lead_headline.a['href'],)

        # Secondary headlines live in two sibling blocks.
        h2s = cls._story_urls(
            soup.find_all('div', id=['block-1-0', 'block-0-1']))

        # Tertiary headlines.
        h3s = cls._story_urls(soup.find_all('div', id='block-0-2'))

        return h1s, h2s, h3s