summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Matt Singleton <msingleton@aclu.org> 2017-10-14 17:21:20 -0400
committer Matt Singleton <msingleton@aclu.org> 2017-10-14 17:25:31 -0400
commit beb04a9bb4935068926e167a38a3fdf9ec37c049 (patch)
tree b12fc54de4fb065fc47c392b4d14daaecf500fa7
parent 753b48246a8e3eb5bfffa77814ff297287951e03 (diff)
Christian Science Monitor closes #6
-rw-r--r-- unbiased/sources/csm.py 41
1 files changed, 41 insertions, 0 deletions
diff --git a/unbiased/sources/csm.py b/unbiased/sources/csm.py
new file mode 100644
index 0000000..4e1eea5
--- /dev/null
+++ b/unbiased/sources/csm.py
@@ -0,0 +1,41 @@
+from unbiased.sources.base import NewsSource
+
+class CSM(NewsSource):
+
+ name = 'Christian Science Monitor'
+ shortname = 'csm'
+ url = 'https://www.csmonitor.com/USA'
+
+ bad_titles = ['Change Agent']
+ bad_imgs = ['csm_logo']
+ bad_urls = ['difference-maker']
+
+ @classmethod
+ def _fetch_urls(cls):
+ soup = cls._fetch_content(cls.url)
+
+ # get primary headline
+ h1 = soup.find('div', id='block-0-0')\
+ .find('h3', class_='story_headline')\
+ .a['href']
+ h1s = (h1,)
+
+ # get secondary headlines
+ h2_blocks = soup.find_all('div', id=['block-1-0', 'block-0-1'])
+ h2s = []
+ for block in h2_blocks:
+ hblocks = block.find_all('h3', class_='story_headline')
+ for hblock in hblocks:
+ h2s += [x for x in hblock.find_all('a') if 'first-look' not in x['href']]
+ h2s = tuple(x['href'] for x in h2s)
+
+ # get tertiary headlines
+ h3_blocks = soup.find_all('div', id='block-0-2')
+ h3s = []
+ for block in h3_blocks:
+ hblocks = block.find_all('h3', class_='story_headline')
+ for hblock in hblocks:
+ h3s += [x for x in hblock.find_all('a') if 'first-look' not in x['href']]
+ h3s = tuple(x['href'] for x in h3s)
+
+ return h1s, h2s, h3s