author     Matt Singleton <matt@xcolour.net>   2017-09-10 13:09:12 -0400
committer  Matt Singleton <matt@xcolour.net>   2017-09-10 13:09:12 -0400
commit     d0c7c0541013cc9472b38ccfd614a314e9a86d70 (patch)
tree       761aa2e7cec0e9d7ce25b516aef5875b962fc032
parent     38a64b344bc6a25ce0faf17ddb7ed3439d0d007d (diff)
major refactor of news source building
-rw-r--r--   requirements.txt              |   2
-rwxr-xr-x   unbiased/main.py              |  62
-rw-r--r--   unbiased/sources/__init__.py  |  10
-rw-r--r--   unbiased/sources/base.py      | 222
-rw-r--r--   unbiased/sources/fox.py       |  41
-rw-r--r--   unbiased/unbiasedFunctions.py |  87
6 files changed, 320 insertions, 104 deletions
diff --git a/requirements.txt b/requirements.txt
index 3767095..0d53cea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
beautifulsoup4~=4.6.0
Jinja2~=2.9.6
-lxml=~=3.8.0
+lxml~=3.8.0
Pillow~=4.2.1
requests~=2.18.4
diff --git a/unbiased/main.py b/unbiased/main.py
index 4ccda24..949e646 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -5,9 +5,8 @@ import logging
import logging.config
import time
-from unbiased.unbiasedObjects import *
-from unbiased.unbiasedFunctions import *
-from unbiased.parser import *
+from unbiased.unbiasedFunctions import pickStories, pullImage, buildOutput, writeOutputHTML
+from unbiased.sources import get_sources
logger = logging.getLogger('unbiased')
@@ -52,6 +51,7 @@ def main():
parser.add_argument('-l', '--log-dir', help='location to write detailed logs')
parser.add_argument('-d', '--debug', action='store_true', help='run in debug mode')
parser.add_argument('-o', '--oneshot', action='store_true', help='run once and exit')
+ parser.add_argument('-s', '--sources', type=lambda x: x.split(','), default=None)
args = parser.parse_args()
if args.log_dir:
@@ -67,7 +67,7 @@ def main():
while True:
logger.info('Starting crawl')
start = time.time()
- run(args.webroot)
+ run(args.webroot, args.sources)
finish = time.time()
runtime = finish - start
sleeptime = crawl_frequency - runtime
@@ -77,51 +77,33 @@ def main():
if sleeptime > 0:
time.sleep(sleeptime)
-def run(webroot):
- sources = []
-
- '''
- SOURCES TO ADD NEXT:
- -REUTERS
- -Town Hall
- '''
-
- logger.debug('Running with webroot="{}"'.format(webroot))
-
- ### These values have to be the second half of the function name
- ### E.g. Guardian calls buildGuardian(), etc.
- sourceFnArr = [
- 'Guardian',
- 'TheHill',
- 'NPR',
- 'BBC',
- 'NBC',
- 'CBS',
- 'FoxNews',
- 'WashTimes',
- 'CSM',
- 'ABC',
- ]
-
- for source in sourceFnArr:
- logger.info('Crawling {}'.format(source))
+def run(webroot, source_names):
+
+ logger.debug('Running with webroot="{}" for sources="{}"'.format(webroot, source_names))
+
+ sources = get_sources()
+ logger.debug(sources)
+ if source_names is None:
+ sources = sources.values()
+ else:
+ sources = [sources[x] for x in source_names]
+
+ built_sources = []
+ for source in sources:
+ logger.info('Crawling {}'.format(source.name))
tries = 0
while tries < 3:
time.sleep(tries)
try:
- fn = 'build' + source
- possibles = globals().copy()
- possibles.update(locals())
- method = possibles.get(fn)
- src = method()
- sources.append(src)
+ built_sources.append(source.build())
break
except Exception as ex:
tries += 1
if tries == 3:
- logger.error('Build failed. source={} ex={}'.format(source, ex))
+ logger.error('Build failed. source={} ex={}'.format(source.name, ex))
else:
- logger.debug('Build failed, retrying. source={} ex={}'.format(source, ex))
+ logger.debug('Build failed, retrying. source={} ex={}'.format(source.name, ex))
+ sources = tuple(built_sources)
logger.info('Parsed home pages for: {}'.format([x.name for x in sources]))
top_stories, middle_stories, bottom_stories = pickStories(sources)
diff --git a/unbiased/sources/__init__.py b/unbiased/sources/__init__.py
new file mode 100644
index 0000000..e4a473a
--- /dev/null
+++ b/unbiased/sources/__init__.py
@@ -0,0 +1,10 @@
+import importlib
+import pkgutil
+
+from unbiased.sources.base import NewsSource
+
+def get_sources():
+ for loader, name, is_pkg in pkgutil.walk_packages(__path__):
+ if name != 'base':
+ importlib.import_module('unbiased.sources.' + name)
+ return {x.shortname.lower(): x for x in NewsSource.__subclasses__()}
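
Note: the registry above imports every module in unbiased/sources/ except base and keys each NewsSource subclass by its lowercased shortname. A rough sketch of how main.run() consumes it (only the Fox source added later in this commit exists in this tree so far):

from unbiased.sources import get_sources

sources = get_sources()
# e.g. {'fox': <class 'unbiased.sources.fox.Fox'>}
wanted = ['fox']                              # what the new --sources flag yields
selected = [sources[name] for name in wanted] # or sources.values() when no flag is given
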
diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py
new file mode 100644
index 0000000..9dc14fd
--- /dev/null
+++ b/unbiased/sources/base.py
@@ -0,0 +1,222 @@
+import collections
+import html
+import logging
+import urllib
+
+from bs4 import BeautifulSoup
+import requests
+
+logger = logging.getLogger('unbiased')
+
+class Article(object):
+
+ def __init__(self, source, title, author, description, url, img):
+ self.source = source
+ self.title = title
+ self.author = author
+ self.description = description
+ self.url = url
+ self.img = img
+
+ def __repr__(self):
+ return 'Article({}, {}, {}, {}, {}, {})'.format(self.source, self.title, self.author, self.description, self.url, self.img)
+
+class NewsSource(object):
+ """
+ Abstract base class.
+ To implement:
+ - set 'name', 'shortname', and 'url'
+ - set 'bad_' variables to blacklist terms and phrases
+ - implement '_fetch_urls()', which should return three tuples
+ of urls, one for each tier
+ - override any of the '_get_*()' functions as necessary
+ """
+ # TODO: replace all string parsing with bs4
+
+ name = None
+ shortname = None
+ url = None
+
+ bad_titles = None
+ bad_authors = None
+ bad_descriptions = None
+ bad_imgs = None
+ bad_urls = None
+
+ def __init__(self, h1s, h2s, h3s):
+ self.h1s = h1s
+ self.h2s = h2s
+ self.h3s = h3s
+
+ @classmethod
+ def build(cls):
+ h1s, h2s, h3s = cls._fetch_urls()
+ h1s = tuple(cls._fix_url(x) for x in h1s)
+ h2s = tuple(cls._fix_url(x) for x in h2s)
+ h3s = tuple(cls._fix_url(x) for x in h3s)
+ h1s, h2s, h3s = cls._remove_duplicates(h1s, h2s, h3s)
+ h1s, h2s, h3s = cls._fetch_articles(h1s, h2s, h3s)
+ h1s, h2s, h3s = cls._remove_all_bad_stories(h1s, h2s, h3s)
+ return cls(h1s, h2s, h3s)
+
+ @classmethod
+ def _fetch_content(cls, url):
+ res = requests.get(url)
+ if res.status_code == 200:
+ content = res.text
+ else:
+ raise Exception("Failed to download {}".format(url))
+ return BeautifulSoup(content, 'lxml')
+
+ @classmethod
+ def _fix_url(cls, url, scheme='http'):
+ """
+ Make sure they have a scheme.
+ Trim any query parameters.
+ """
+ # TODO: proper URL parsing
+ if url.startswith('//'):
+ url = '{}:{}'.format(scheme, url)
+ url = url.split('?')[0]
+ return url
+
+ @classmethod
+ def _remove_duplicates(cls, h1s, h2s, h3s):
+ h2s = tuple(x for x in h2s if x not in h1s)
+ h3s = tuple(x for x in h3s if x not in h1s and x not in h2s)
+ return h1s, h2s, h3s
+
+ @classmethod
+ def _remove_bad_stories(cls, articles, element, filters):
+ if filters is None:
+ return articles
+ new_articles = []
+ for article in articles:
+ save = True
+ for f in filters:
+ if f in getattr(article, element):
+ save = False
+ break
+ if save:
+ new_articles.append(article)
+ return tuple(new_articles)
+
+ @classmethod
+ def _remove_all_bad_stories(cls, h1s, h2s, h3s):
+ new_articles = []
+ for articles in [h1s, h2s, h3s]:
+ articles = cls._remove_bad_stories(articles, 'title', cls.bad_titles)
+ articles = cls._remove_bad_stories(articles, 'description', cls.bad_descriptions)
+ articles = cls._remove_bad_stories(articles, 'author', cls.bad_authors)
+ articles = cls._remove_bad_stories(articles, 'img', cls.bad_imgs)
+ articles = cls._remove_bad_stories(articles, 'url', cls.bad_urls)
+ new_articles.append(list(articles))  # list, so tier 1 can borrow a story from tier 2 below
+ if len(new_articles[0]) == 0 and len(new_articles[1]) > 0:
+ new_articles[0].append(new_articles[1].pop())
+ return tuple(tuple(x) for x in new_articles)
+
+ @classmethod
+ def _fetch_articles(cls, h1s, h2s, h3s):
+ ret = []
+ for urls in [h1s, h2s, h3s]:
+ articles = []
+ for url in urls:
+ article = cls._fetch_article(url)
+ if article is not None:
+ articles.append(article)
+ ret.append(articles)
+ return tuple(tuple(x) for x in ret)
+
+ @classmethod
+ def _fetch_article(cls, url):
+ #soup = cls._fetch_content(url)
+
+ logger.debug(cls.name)
+ logger.debug(url)
+
+ url_parts = urllib.parse.urlparse(url)
+ scheme = url_parts.scheme
+
+ # download url
+ try:
+ res = requests.get(url)
+ except Exception as ex:
+ logger.debug("""ARTICLE DOWNLOADING ERROR
+ SOURCE:\t{}
+ URL:\t{}""".format(cls.name, url))
+ return None
+
+ if res.status_code == 200:
+ content = res.text
+ else:
+ logger.debug("""ARTICLE DOWNLOADING ERROR
+ SOURCE:\t{}
+ URL:\t{}""".format(cls.name, url))
+ return None
+
+ try:
+ img = cls._get_image(content)
+ img = urllib.parse.urlparse(img, scheme=scheme).geturl()
+ logger.debug(img)
+
+ title = cls._get_title(content)
+ logger.debug(title)
+
+ author = cls._get_author(content)
+ logger.debug(author)
+
+ description = cls._get_description(content)
+ logger.debug(description)
+ description = cls._remove_self_refs(description)
+ logger.debug(description)
+ except Exception:
+ logger.debug("""ARTICLE PARSING ERROR
+ SOURCE:\t{}
+ URL:\t{}""".format(cls.name, url))
+ return None
+
+ return Article(cls.name, title, author, description, url, img)
+
+ @classmethod
+ def _get_image(cls, content):
+ img = content.split('og:image" content=')[1][1:].split('>')[0]
+ if img[-1] == '/':
+ #because the quote separator could be ' or ",
+ #trim to just before it then lop it off
+ img = img[:-1].strip()
+ img = img[:-1]
+ return img
+
+ @classmethod
+ def _get_title(cls, content):
+ title = content.split('og:title" content=')[1][1:].split('>')[0]
+ if title[-1] == '/':
+ title = title[:-1].strip()
+ title = title[:-1]
+ return title
+
+ @classmethod
+ def _get_author(cls, content):
+ author = ''
+ authorTags = ['article:author', 'dc.creator', 'property="author']
+ for tag in authorTags:
+ if tag in content:
+ author = content.split(tag+'" content=')[1][1:].split('>')[0]
+ author = author[:-1]
+ break
+ return author
+
+ @classmethod
+ def _get_description(cls, content):
+ description = content.split('og:description" content=')[1][1:].split('>')[0]
+ if description[-1] == '/':
+ description = description[:-1].strip()
+ description = description[:-1]
+ return description
+
+ @classmethod
+ def _remove_self_refs(cls, description):
+ description = description.replace(cls.name+"'s", '***')
+ description = description.replace(cls.name+"'", '***')
+ description = description.replace(cls.name, '***')
+ return description
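
Note: per the "To implement" notes in the NewsSource docstring, a subclass would look roughly like the sketch below. The Example class, CSS selectors, and filter terms are invented for illustration and are not part of this commit; build() then downloads each URL, parses the og: meta tags into Article objects, and drops anything matching the bad_* lists.

from unbiased.sources.base import NewsSource

class Example(NewsSource):
    # hypothetical source, shown only to illustrate the base class contract
    name = 'Example News'
    shortname = 'Example'
    url = 'http://www.example.com'

    bad_titles = ['Opinion']
    bad_urls = ['example.com/video']

    @classmethod
    def _fetch_urls(cls):
        # return three tuples of article URLs, one per headline tier
        soup = cls._fetch_content(cls.url)
        h1s = tuple(a['href'] for a in soup.select('.lead-story a')[:1])
        h2s = tuple(a['href'] for a in soup.select('.top-stories a'))
        h3s = tuple(a['href'] for a in soup.select('.latest a'))
        return h1s, h2s, h3s
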
diff --git a/unbiased/sources/fox.py b/unbiased/sources/fox.py
new file mode 100644
index 0000000..ce7730f
--- /dev/null
+++ b/unbiased/sources/fox.py
@@ -0,0 +1,41 @@
+from unbiased.sources.base import NewsSource
+
+class Fox(NewsSource):
+
+ name = 'Fox News'
+ shortname = 'Fox'
+ url = 'http://www.foxnews.com'
+
+ bad_titles = ['O&#039;Reilly', 'Fox News', 'Brett Baier', 'Tucker']
+ bad_descriptions = ['Sean Hannity']
+ bad_authors = ['Bill O\'Reilly', 'Sean Hannity', 'Howard Kurtz']
+ bad_imgs = ['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
+ bad_urls = ['http://www.foxnews.com/opinion', 'videos.foxnews.com']
+
+ @classmethod
+ def _fetch_urls(cls):
+ """
+ Returns three tuples of urls, one for each of
+ the three tiers.
+ """
+ soup = cls._fetch_content(cls.url)
+
+ # get primary headline
+ h1 = soup.find('div', id='big-top')\
+ .find('div', class_='primary')\
+ .find('h1')\
+ .find('a')['href']
+ h1s = (h1,)
+
+ # get secondary headlines
+ h2s = soup.find('div', id='big-top').find('div', class_='top-stories').select('li > a')
+ h2s = tuple(x['href'] for x in h2s)
+
+ # get tertiary headlines
+ h3s = []
+ for ul in soup.find('section', id='latest').find_all('ul', recursive=False):
+ for li in ul.find_all('li', recursive=False):
+ h3s.append(li.find('a')['href'])
+ h3s = tuple(h3s)
+
+ return h1s, h2s, h3s
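
Note: as a quick smoke test of the new source (assuming foxnews.com is reachable and its markup still matches these selectors), the class can be exercised on its own:

from unbiased.sources.fox import Fox

fox = Fox.build()
for article in fox.h1s + fox.h2s:
    print(article.title, article.url)
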
diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py
index 6ec89b7..7825d93 100644
--- a/unbiased/unbiasedFunctions.py
+++ b/unbiased/unbiasedFunctions.py
@@ -11,8 +11,6 @@ import urllib.parse
from PIL import Image
import requests
-from unbiased.unbiasedObjects import *
-
logger = logging.getLogger('unbiased')
#take in a url and delimiters, return twitter card
@@ -141,69 +139,30 @@ def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd,
return None
-def pickStories(newsSourceArr):
- # TODO: refactor to avoid infinite loops
- #set the random order for sources
- h1RandomSources=[]
- guard = 0
- while len(h1RandomSources)<4:
- x=random.sample(range(len(newsSourceArr)), 1)[0]
- if len(newsSourceArr[x].h1Arr)>0:
- if x not in h1RandomSources:
- h1RandomSources.append(x)
- else:
- logger.debug('No H1 stories in '+newsSourceArr[x].name)
- guard += 1
- if guard > 100:
- return [], [], []
-
- #For h2s and h3s, select N random sources (can repeat), then
- #a non-repetitive random article from within
- h2RandomPairs=[]
- while len(h2RandomPairs) < 6:
- x=random.sample(range(len(newsSourceArr)), 1)[0]
- if len(newsSourceArr[x].h2Arr) > 0:
- y=random.sample(range(len(newsSourceArr[x].h2Arr)), 1)[0]
- pair=[x,y]
- if not pair in h2RandomPairs:
- h2RandomPairs.append(pair)
- else:
- logger.debug('No H2 stories in '+newsSourceArr[x].name)
-
- h3RandomPairs=[]
- while len(h3RandomPairs) < 12:
- x=random.sample(range(len(newsSourceArr)), 1)[0]
- if len(newsSourceArr[x].h3Arr) > 0:
- y=random.sample(range(len(newsSourceArr[x].h3Arr)), 1)[0]
- pair=[x,y]
- if not pair in h3RandomPairs:
- h3RandomPairs.append(pair)
- else:
- logger.debug('No H3 stories in '+newsSourceArr[x].name)
+def pick_randoms(story_lists, length, per_source):
+ """
+ Return a randomly chosen list of 'length' stories, picking at
+ most 'per_source' stories from each source.
+ """
+ # TODO: weighting is incorrect if a source has fewer than 'per_source' articles
+ urandom = random.SystemRandom()
+ candidates = []
+ for stories in story_lists:
+ indexes = list(range(len(stories)))
+ urandom.shuffle(indexes)
+ random_indexes = indexes[:per_source]
+ candidates.extend([stories[x] for x in random_indexes])
+ indexes = list(range(len(candidates)))
+ urandom.shuffle(indexes)
+ random_indexes = indexes[:length]
+ return tuple(candidates[x] for x in random_indexes)
- # collect articles for each section
- image_index = 0
- top_stories = []
- for i in range(len(h1RandomSources)):
- source=newsSourceArr[h1RandomSources[i]]
- randomArticle=random.sample(range(len(source.h1Arr)), 1)[0]
- article=source.h1Arr[randomArticle]
- top_stories.append(article)
-
- middle_stories = []
- for i in range(len(h2RandomPairs)):
- pair=h2RandomPairs[i]
- article=newsSourceArr[pair[0]].h2Arr[pair[1]]
- middle_stories.append(article)
-
- bottom_stories = []
- for i in range(len(h3RandomPairs)):
- pair=h3RandomPairs[i]
- article=newsSourceArr[pair[0]].h3Arr[pair[1]]
- bottom_stories.append(article)
-
- return top_stories, middle_stories, bottom_stories
+def pickStories(newsSourceArr):
+ h1s = pick_randoms([x.h1s for x in newsSourceArr], 4, 1)
+ h2s = pick_randoms([x.h2s for x in newsSourceArr], 6, 2)
+ h3s = pick_randoms([x.h3s for x in newsSourceArr], 12, 2)
+ return h1s, h2s, h3s
def buildOutput(top_stories, middle_stories, bottom_stories):
#read in the template html file
@@ -270,6 +229,8 @@ def pullImage(url, index, webroot, target_width=350, target_height=200):
# resize if larger
if target_width * 2 < width or target_height * 2 < height:
img = img.resize((target_width*2, target_height*2), Image.LANCZOS)
+ # TODO: fill with a neutral color instead of just discarding alpha channel
+ img = img.convert('RGB')
# TODO: create retina images
jpg_name = 'img{}.jpg'.format(index)
out_file = os.path.join(webroot, jpg_name)
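
Note: to make the new selection logic concrete, this is roughly how pick_randoms behaves with plain lists (output order is random; the counts are what matter):

from unbiased.unbiasedFunctions import pick_randoms

story_lists = [
    ['a1', 'a2', 'a3'],   # source A
    ['b1', 'b2'],         # source B
    ['c1'],               # source C
]
# take at most 2 candidates per source, then 4 of those at random
picked = pick_randoms(story_lists, length=4, per_source=2)
assert len(picked) == 4
assert sum(s.startswith('a') for s in picked) <= 2
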