# unbiased/sources/thehill.py

import urllib
import urllib.parse

from unbiased.sources.base import NewsSource

class TheHill(NewsSource):
    """Scraper for the front page of The Hill (thehill.com).

    Extracts the headline story plus the secondary and tertiary stories
    from the "top content" section of the home page.
    """

    name = 'The Hill'
    shortname = 'Hill'
    url = 'http://thehill.com'

    # NOTE(review): presumably consumed by NewsSource to filter out
    # articles by title/author -- confirm against the base class.
    bad_titles = ['THE MEMO']
    bad_authors = ['Matt Schlapp', 'Juan Williams', 'Judd Gregg']

    @classmethod
    def _fetch_urls(cls):
        """Collect the front-page article URLs, grouped by prominence.

        Returns:
            A 3-tuple ``(h1s, h2s, h3s)`` of tuples of absolute URLs:
            the single top story, the regular top-story items, and the
            top-story items carrying the ``small`` CSS class. Each tier
            is de-duplicated and kept in page order.

        Raises:
            AttributeError / TypeError / KeyError: if the expected page
                structure (headline, section container, item links) is
                missing, which signals that the site layout changed.
        """
        # Presumably a BeautifulSoup document, given the find/find_all
        # usage below -- the helper itself lives in the base class.
        soup = cls._fetch_content(cls.url)

        top_href = soup.find('h1', class_='top-story-headline')\
            .find('a')['href']
        h1s = (urllib.parse.urljoin(cls.url, top_href),)

        items = soup.find('div', class_='section-top-content')\
            .find_all('div', class_='top-story-item')

        # Dicts are used as ordered sets: they drop duplicates while
        # keeping the order in which stories appear on the page, so the
        # result is stable from run to run (a plain set is not).
        regular, small = {}, {}
        for item in items:
            # Resolve to an absolute URL *before* de-duplicating so that
            # relative and absolute forms of the same link collapse.
            link = urllib.parse.urljoin(cls.url, item.h4.a['href'])
            tier = small if 'small' in item['class'] else regular
            tier[link] = None

        return h1s, tuple(regular), tuple(small)

    @classmethod
    def _get_description(cls, soup):
        """Return a short description for the article in ``soup``.

        Uses the base-class extraction first; if that raises, falls back
        to the first ~200 characters of the article body, cut back to
        the last complete word.

        Raises:
            AttributeError: if the fallback cannot find the article body
                (no ``div.field-items`` element in the page).
        """
        try:
            return NewsSource._get_description(soup)
        except Exception:
            # The base implementation's failure modes are not visible
            # here, so any error triggers the fallback: grab text from
            # the article itself.
            body = soup.find('div', class_='field-items')
            text = body.text
            if len(text) <= 200:
                # Nothing is being cut, so there is no partial trailing
                # word to discard.
                return text
            # Drop the (possibly partial) last word left by the cut.
            return text[:200].rsplit(' ', 1)[0]