-rw-r--r--   .gitignore                      1
-rw-r--r--   requirements.txt                2
-rwxr-xr-x   unbiased/main.py               82
-rwxr-xr-x   unbiased/parser.py            986
-rw-r--r--   unbiased/sources/__init__.py   10
-rw-r--r--   unbiased/sources/abc.py        42
-rw-r--r--   unbiased/sources/base.py      210
-rw-r--r--   unbiased/sources/bbc.py        26
-rw-r--r--   unbiased/sources/cbs.py        37
-rw-r--r--   unbiased/sources/csm.py        41
-rw-r--r--   unbiased/sources/fox.py        41
-rw-r--r--   unbiased/sources/guardian.py   37
-rw-r--r--   unbiased/sources/npr.py        29
-rw-r--r--   unbiased/sources/thehill.py    39
-rw-r--r--   unbiased/sources/washtimes.py  31
-rwxr-xr-x   unbiased/spotCheck.py          41
-rw-r--r--   unbiased/unbiasedFunctions.py 277
-rw-r--r--   unbiased/unbiasedObjects.py    97
-rw-r--r--   unbiased/util.py              113
19 files changed, 694 insertions, 1448 deletions
diff --git a/.gitignore b/.gitignore
index 9e0f924..ad2b57a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,4 @@ venv/
unbiased.egg-info/
#*
.#*
+*.swp
diff --git a/requirements.txt b/requirements.txt
index 3767095..0d53cea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
beautifulsoup4~=4.6.0
Jinja2~=2.9.6
-lxml=~=3.8.0
+lxml~=3.8.0
Pillow~=4.2.1
requests~=2.18.4
diff --git a/unbiased/main.py b/unbiased/main.py
index 4ccda24..19fd05b 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -1,13 +1,13 @@
#!/usr/bin/env python3
import argparse
+import io
import logging
import logging.config
import time
-from unbiased.unbiasedObjects import *
-from unbiased.unbiasedFunctions import *
-from unbiased.parser import *
+from unbiased.util import pickStories, pullImage, buildOutput, write_files, write_static_files
+from unbiased.sources import get_sources
logger = logging.getLogger('unbiased')
@@ -52,6 +52,7 @@ def main():
parser.add_argument('-l', '--log-dir', help='location to write detailed logs')
parser.add_argument('-d', '--debug', action='store_true', help='run in debug mode')
parser.add_argument('-o', '--oneshot', action='store_true', help='run once and exit')
+ parser.add_argument('-s', '--sources', type=lambda x: x.split(','), default=None, help='comma-separated list of source shortnames to crawl')
args = parser.parse_args()
if args.log_dir:
@@ -67,7 +68,7 @@ def main():
while True:
logger.info('Starting crawl')
start = time.time()
- run(args.webroot)
+ run(args.webroot, args.sources, args.debug)
finish = time.time()
runtime = finish - start
sleeptime = crawl_frequency - runtime
@@ -77,51 +78,34 @@ def main():
if sleeptime > 0:
time.sleep(sleeptime)
-def run(webroot):
- sources = []
-
- '''
- SOURCES TO ADD NEXT:
- -REUTERS
- -Town Hall
- '''
-
- logger.debug('Running with webroot="{}"'.format(webroot))
-
- ### These values have to be the second half of the function name
- ### E.g. Guardian calls buildGuardian(), etc.
- sourceFnArr = [
- 'Guardian',
- 'TheHill',
- 'NPR',
- 'BBC',
- 'NBC',
- 'CBS',
- 'FoxNews',
- 'WashTimes',
- 'CSM',
- 'ABC',
- ]
-
- for source in sourceFnArr:
- logger.info('Crawling {}'.format(source))
+def run(webroot, source_names, debug_mode=False):
+
+ logger.debug('Running with webroot="{}" for sources="{}"'.format(webroot, source_names))
+
+ sources = get_sources()
+ if source_names is None:
+ sources = sources.values()
+ else:
+ sources = [sources[x] for x in source_names]
+
+ built_sources = []
+ for source in sources:
+ logger.info('Crawling {}'.format(source.name))
tries = 0
while tries < 3:
time.sleep(tries)
try:
- fn = 'build' + source
- possibles = globals().copy()
- possibles.update(locals())
- method = possibles.get(fn)
- src = method()
- sources.append(src)
+ built_sources.append(source.build())
break
except Exception as ex:
+ if debug_mode is True:
+ raise
tries += 1
if tries == 3:
- logger.error('Build failed. source={} ex={}'.format(source, ex))
+ logger.error('Build failed. source={} ex={}'.format(source.name, ex))
else:
- logger.debug('Build failed, retrying. source={} ex={}'.format(source, ex))
+ logger.debug('Build failed, retrying. source={} ex={}'.format(source.name, ex))
+ sources = tuple(built_sources)
logger.info('Parsed home pages for: {}'.format([x.name for x in sources]))
top_stories, middle_stories, bottom_stories = pickStories(sources)
@@ -129,20 +113,26 @@ def run(webroot):
logger.info('Picked middle stories from: {}'.format([x.source for x in middle_stories]))
logger.info('Picked bottom stories from: {}'.format([x.source for x in bottom_stories]))
+ files_to_write = {}
+
# download images
img_idx = 0
for story in top_stories:
- story.img = pullImage(story.img, img_idx, webroot, 350, 200)
+ story.img, img_jpg = pullImage(story.img, img_idx, webroot, 350, 200)
+ files_to_write[story.img] = img_jpg
img_idx += 1
for story in middle_stories:
- story.img = pullImage(story.img, img_idx, webroot, 150, 100)
+ story.img, img_jpg = pullImage(story.img, img_idx, webroot, 150, 100)
+ files_to_write[story.img] = img_jpg
img_idx += 1
- #build the output file HTML
- outputHTML = buildOutput(top_stories, middle_stories, bottom_stories)
+ # build the output file HTML
+ output_html = buildOutput(top_stories, middle_stories, bottom_stories)
+ output_html = io.BytesIO(output_html.encode('utf8'))
+ files_to_write['index.html'] = output_html
- #print the output file HTML
- writeOutputHTML(outputHTML, webroot)
+ write_files(files_to_write, webroot)
+ write_static_files(webroot)
if __name__=="__main__":
main()
diff --git a/unbiased/parser.py b/unbiased/parser.py
deleted file mode 100755
index 399e0f2..0000000
--- a/unbiased/parser.py
+++ /dev/null
@@ -1,986 +0,0 @@
-#!/usr/bin/env python3
-
-import logging
-import os
-import re
-import urllib.parse
-
-from bs4 import BeautifulSoup
-import requests
-
-from unbiased.unbiasedObjects import *
-from unbiased.unbiasedFunctions import buildArticle
-
-logger = logging.getLogger('unbiased')
-
-
-'''
-Takes in a URL, downloads the file to a temp file,
-reads the file into a string, and returns that string
-'''
-def urlToContent(url, sourceEncoding='utf8'):
- res = requests.get(url)
- if res.status_code == 200:
- return res.text
- else:
- raise Exception("Failed to download {}".format(url))
-
-
-'''
-Creates a new newsSource2 object. For each URL in h1-h3URLs,
-calls the file scraper and appends the new Article object.
-Returns a newsSource2 object
-'''
-def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
-
- url_parts = urllib.parse.urlparse(url)
- scheme = url_parts.scheme
- h1URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h1URLs]
- h2URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h2URLs]
- h3URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h3URLs]
-
- h1Arr=[]
- a=buildArticle(h1URLs[0], name)
- if a==None:
- logger.debug('H1 Nonetype in '+name)
- else:
- h1Arr.append(a)
-
- h2Arr=[]
- for x in h2URLs:
- a=buildArticle(x, name)
- if a!=None:
- h2Arr.append(a)
- else:
- logger.debug('H2 Nonetype in '+name)
-
- h3Arr=[]
- for x in h3URLs:
- a=buildArticle(x, name)
- if a!=None:
- h3Arr.append(a)
- else:
- logger.debug('H3 Nonetype in '+name)
-
- #BUILD THE NEWS SOURCE
- newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr)
-
- return newsSource
-
-
-'''
-Some sites will replicate URLs across the page. This function removes them.
-Check hierarchically: if h3 exists in h1s or h2s, remove from h3s;
-if h2 exists in h1s, remove from h2s
-
-also check partial URLs (e.g. nytimes.com/story.html is the same as
-nytimes.com/story.html?var=x
-'''
-def removeDuplicates(h1s, h2s, h3s):
- #Assume h1s is one element, and keep it
-
- #remove h2 duplicates
- removeArr=[]
- for i in range(len(h2s)):
- #check internally
- for j in range(len(h2s)):
- if i==j:
- continue
- else:
- if h2s[i] in h2s[j]:
- removeArr.append(h2s[j])
- #check against h1s
- for k in range(len(h1s)):
- if (h2s[i] in h1s[k]) or (h1s[k] in h2s[i]):
- removeArr.append(h2s[i])
- for x in removeArr:
- h2s.remove(x)
-
- #remove h3 duplicates
- removeArr=[]
- for i in range(len(h3s)):
- #check internally
- for j in range(len(h3s)):
- if i==j:
- continue
- else:
- if h3s[i] in h3s[j]:
- removeArr.append(h3s[j])
- #check against h1s and h2s
- h1and2=h1s+h2s
- for k in range(len(h1and2)):
- if (h3s[i] in h1and2[k]) or (h1and2[k] in h3s[i]):
- removeArr.append(h3s[i])
- for x in removeArr:
- if x in h3s:
- h3s.remove(x)
-
-
- return h1s, h2s, h3s
-
-
-
-def removalNotification(source, title, reason, value):
- logger.debug("""Story removed
- SOURCE:\t{}
- TITLE:\t{})
- REASON:\t{}
- VALUE:\t{}""".format(source, title, reason, value))
-
-
-def removeBadStoriesHelper(source, element, badStringList, article_tiers):
- if badStringList is None:
- return
- for tier, articles in enumerate(article_tiers):
- for idx, article in enumerate(articles):
- if article is None:
- logger.debug("None type found in removeBadStoriesHelper for {}".format(source.name))
- break
- for item in badStringList:
- if item in getattr(article, element):
- article_tiers[tier].remove(article)
- # if it's in the h1 slot, bump up the
- # first h2 into the h1 slot
- if tier == 0 and len(article_tiers[1]) > 0:
- article_tiers[0].append(article_tiers[1][0])
- article_tiers[1].remove(article_tiers[1][0])
- removalNotification(source.name, article.title, element, item)
-
-
-def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None):
-
- arr=[source.h1Arr, source.h2Arr, source.h3Arr]
-
- removeBadStoriesHelper(source, "title", badTitleArr, arr)
- removeBadStoriesHelper(source, "description", badDescArr, arr)
- removeBadStoriesHelper(source, "author", badAuthorArr, arr)
- removeBadStoriesHelper(source, "img", badImgArr, arr)
- removeBadStoriesHelper(source, "url", badURLArr, arr)
-
- return source
-
-
-
-
-def buildTheHill():
- url='http://thehill.com'
- name='The Hill'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- h1=h1.split('<div class="headline-story-image">', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
- h1s=[url+h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('<div class="section-top-content">', 1)[1]
- h2=h2.split('</ul>', 1)[0]
- while '<div class="top-story-item' in h2 and len(h2s)<4:
- h2=h2.split('<div class="top-story-item', 1)[1]
- x=h2.split('<a href="', 1)[1]
- x=x.split('"', 1)[0]
- h2s.append(url+x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('<div class="section-top-content">', 1)[1]
- h3=h3.split('</ul>', 1)[0]
- while '<div class="top-story-item small' in h3:
- h3=h3.split('<div class="top-story-item small', 1)[1]
- x=h3.split('<a href="', 1)[1]
- x=x.split('"', 1)[0]
- h3s.append(url+x)
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- hil=buildNewsSource2(name, url, h1s, h2s, h3s)
- hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp', 'Juan Williams', 'Judd Gregg'], None, None)
-
- return hil
-
-
-
-
-
-def buildGuardian():
- url='http://www.theguardian.com/us'
- name='The Guardian US'
-
-
- while True:
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, 'utf8')
-
- #get main headline
- h1=content
- h1=h1.split('<h1', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
-
- if h1!='https://www.theguardian.com/us':
- break
- else:
- logger.debug('Guardian loop')
-
- h1s=[h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- #only the h1 and the two h2s have this, so split on it and grab
- #the second two
- h2=h2.split('<div class="fc-item__image-container u-responsive-ratio inlined-image">')[2:]
- for x in h2:
- if '<h2 class="fc-item__title"><a href="' in x:
- x=x.split('<h2 class="fc-item__title"><a href="', 1)[1]
- x=x.split('"', 1)[0]
- h2s.append(x)
- else:
- break
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('<div class="fc-slice-wrapper">', 1)[1]
- h3=h3.split('<div class="fc-container__inner">', 1)[0]#'<div class="js-show-more-placeholder">', 1)[0]
- #this story section goes on forever; just grab the first 5
- while '<h2 class="fc-item__title"><a href="' in h3:
- h3=h3.split('<h2 class="fc-item__title"><a href="', 1)[1]
- x=h3.split('"', 1)[0]
- h3s.append(x)
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
- gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
- gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
-
- return gdn
-
-
-
-def buildWashTimes():
- url='http://www.washingtontimes.com/'
- name='Washington Times'
-
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- h1=h1.split('top-news', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
-
- h1s=[url+h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('class="top-news', 1)[1]
- h2=h2.split('</article>', 1)[1] #end of top-news article
- h2=h2.split('<article ', 1)[0] #note the space; we want unclassed articles
- h2=h2.split('<article>')[1:]
-
- for x in h2:
- x=x.split('<a href="', 1)[1]
- x=x.split('"', 1)[0]
- h2s.append(url+x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('more-from desktop-only', 1)[1]
- h3=h3.split('</section>', 1)[0]
- h3=h3.split('<a href="')[1:]
-
- for x in h3:
- x=x.split('"', 1)[0]
- h3s.append(url+x)
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
- wat=buildNewsSource2(name, url, h1s, h2s, h3s)
- wat=removeBadStories(wat, None, None, None, None)
-
- return wat
-
-
-def buildCSM():
- url='http://www.csmonitor.com/USA'
- name='Christian Science Monitor'
-
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #this makes sure we don't get '/USA' in the URL twice
- url=url.split('/USA')[0]
-
- #get main headline
- h1=content
- h1=h1.split('block-0-0', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
-
- h1s=[url+h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('block-1-0', 1)[1]
- h2=h2.split('ui-section-middle', 1)[0]
- h2=h2.split('<h3 class="story_headline">')[1:]
-
- for x in h2:
- temp=x.split('<a href="', 2)[1:]
- x=temp[0]
- x=x.split('"', 1)[0]
- if x=='/csmlists/special/first-look':
- x=temp[1]
- x=x.split('"', 1)[0]
-
- h2s.append(url+x)
- #also add in the floating story on the left
- h2=content
- h2=h2.split('block-0-1', 1)[1]
- h2=h2.split('<h3 class="story_headline">')[1]
- h2=h2.split('<a href="', 2)[2]
- h2=h2.split('"', 1)[0]
- h2s.append(url+h2)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('block-0-2', 1)[1]
- h3=h3.split('ui-section-top-right', 1)[0]
- h3=h3.split('<h3 class="story_headline')[1:]
-
- for x in h3:
- x=x.split('<a href="', 2)[-1]
- x=x.split('"', 1)[0]
- h3s.append(url+x)
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
- csm=buildNewsSource2(name, url, h1s, h2s, h3s)
-
- badTitleArr=['Change Agent']
- badDescArr=None
- badAuthorArr=None
- badImgArr=['csm_logo']
- badURLArr=['difference-maker']
- csm=removeBadStories(csm, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
-
- return csm
-
-
-
-'''
-Function to fix the oddly short og:descriptions provided
-in The Blaze articles by grabbing the first portion of the story instead
-'''
-def blazeFixDesc(articleArr):
- TAG_RE = re.compile(r'<[^>]+>')
- for i in range(len(articleArr)):
- desc=urlToContent(articleArr[i].url)
- desc=desc.split('<div class="entry-content article-styles">', 1)[1]
- desc=desc.split('<p>', 1)[1]
- desc=TAG_RE.sub('', desc)
- desc=desc.replace('\n', ' ')
- desc=desc[:144]
- articleArr[i].description=desc
-
- return articleArr
-
-
-
-def buildBlaze():
- url='http://theblaze.com'
- name='The Blaze'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- h1=h1.split('<!-- home -->', 1)[1]
- h1=h1.split('<a class="gallery-link" href="', 1)[1]
- h1=h1.split('"', 1)[0]
- h1s=[url+h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('<!-- home -->', 1)[1]
- h2=h2.split('<!-- loop-home -->', 1)[0]
- while '<a class="gallery-link" href="' in h2:#'</figure>\n\n<figure class="gallery-item">' in h2:
- h2=h2.split('<a class="gallery-link" href="', 1)[1]#'</figure>\n\n<figure class="gallery-item">', 1)[1]
- #h2=h2.split('href="', 1)[1]
- x=h2.split('"', 1)[0]
- if h1 not in x:
- h2s.append(url+x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('<!-- loop-home -->', 1)[1]
- #this story section goes on forever; just grab the first 5
- while len(h3s)<5:
- h3=h3.split('<a class="feed-link" href="', 1)[1]
- x=h3.split('"', 1)[0]
- if h1 not in x:
- h3s.append(url+x)
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
- blz=buildNewsSource2(name, url, h1s, h2s, h3s)
-
- badTitleArr=['Tucker Carlson', 'Mark Levin']
- badDescArr=['Lawrence Jones', 'Mike Slater']
- badAuthorArr=['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales', 'Doc Thompson', 'Glenn Beck']
- badImgArr=None
- badURLArr=None
- blz=removeBadStories(blz, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
-
-
- #The Blaze has dumb, short description fields, so we need to grab
- #the first x characters of actual article text instead
- blz.h1Arr=blazeFixDesc(blz.h1Arr)
- blz.h2Arr=blazeFixDesc(blz.h2Arr)
- blz.h3Arr=blazeFixDesc(blz.h3Arr)
-
- return blz
-
-
-
-def buildCBS():
- url='http://cbsnews.com'
- name='CBS News'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- if '<h1 class="title">' in content:
- h1=h1.split('<h1 class="title">', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
- h1s=[url+h1]
- else:
- #for cases where they lead with a video, pull the first h2 as h1
- h1=h1.split('Big News Area Side Assets', 1)[1]
- h1=h1.split('</ul></div>', 1)[0]
- h1=h1.split('<li data-tb-region-item>', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- x=h1.split('"', 1)[0]
- h1s=[url+x]
-
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('Big News Area Side Assets', 1)[1]
- h2=h2.split('</ul></div>', 1)[0]
- while '<li data-tb-region-item>' in h2:
- h2=h2.split('<li data-tb-region-item>', 1)[1]
- h2=h2.split('<a href="', 1)[1]
- x=h2.split('"', 1)[0]
- if h1 not in x:
- h2s.append(url+x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('Latest News', 1)[1]
- #this story section goes on forever; just grab the first 5
- while len(h3s)<5:
- h3=h3.split('<li class="item-full-lead"', 1)[1]
- h3=h3.split('<a href="', 1)[1]
- x=h3.split('"', 1)[0]
- if h1 not in x:
- h3s.append(url+x)
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- cbs=buildNewsSource2(name, url, h1s, h2s, h3s)
- cbs=removeBadStories(cbs, ['60 Minutes'], ['60 Minutes'], None, None, ['whats-in-the-news-coverart'])
-
- return cbs
-
-
-
-
-
-def buildNBC():
- url='http://nbcnews.com'
- name='NBC News'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- h1=h1.split('top-stories-section', 1)[1]
- h1=h1.split('panel_hero', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
- if '.com' not in h1:
- h1=url+h1
- h1s=[h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('ad-content ad-xs mobilebox1', 1)[1]
- h2=h2.split('taboola-native-top-stories-thumbnail', 1)[0]
- while '<div class="story-link' in h2:
- h2=h2.split('<div class="story-link', 1)[1]
- h2=h2.split('<a href="', 1)[1]
- x=h2.split('"', 1)[0]
- if h1 not in x:
- if '.com' not in x:
- x=url+x
- h2s.append(x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('js-more-topstories', 1)[1]
- h3=h3.split('<div class="panel-section', 1)[0]
- while '<div class="story-link' in h3:
- h3=h3.split('<div class="story-link', 1)[1]
- h3=h3.split('<a href="', 1)[1]
- x=h3.split('"', 1)[0]
- if h1 not in x:
- if '.com' not in x:
- x=url+x
- h3s.append(x)
-
- #adjust for today.com urls
- '''
- for arr in [h1s, h2s, h3s]:
- for i in range(len(arr)):
- if 'today.com' in arr[i]:
- arr[i]=arr[i].split('.com', 1)[1]
- '''
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- nbc=buildNewsSource2(name, url, h1s, h2s, h3s)
- nbc=removeBadStories(nbc, None, ['First Read'], None, None, None)
-
-
- return nbc
-
-
-
-
-def buildBBC():
- url='http://www.bbc.com/news/world/us_and_canada'
- name='BBC US & Canada'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- h1=h1.split('buzzard-item', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
- h1s=['http://www.bbc.com'+h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('<div class="pigeon">', 1)[1]
- h2=h2.split('<div id=', 1)[0]
- while 'top_stories#' in h2:
- h2=h2.split('top_stories#', 1)[1]
- h2=h2.split('<a href="', 1)[1]
- x=h2.split('"', 1)[0]
- if h1 not in x:
- h2s.append('http://www.bbc.com'+x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('<div class="macaw">', 1)[1]
- h3=h3.split('Watch/Listen', 1)[0]
- while '<div class="macaw-item' in h3:
- h3=h3.split('<div class="macaw-item', 1)[1]
- h3=h3.split('<a href="', 1)[1]
- x=h3.split('"', 1)[0]
- if h1 not in x:
- h3s.append('http://www.bbc.com'+x)
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- bbc=buildNewsSource2(name, url, h1s, h2s, h3s)
- badTitleArr=None
- badDescArr=None
- badAuthorArr=None
- badImgArr=['bbc_news_logo.png']
- bbc=removeBadStories(bbc, badTitleArr, badDescArr, badAuthorArr, badImgArr)
-
-
- #REMOVE ' - BBC News' from headlines
- for i in range(len(bbc.h1Arr)):
- if ' - BBC News' in bbc.h1Arr[i].title:
- bbc.h1Arr[i].title=bbc.h1Arr[i].title.split(' - BBC News', 1)[0]
- for i in range(len(bbc.h2Arr)):
- if ' - BBC News' in bbc.h2Arr[i].title:
- bbc.h2Arr[i].title=bbc.h2Arr[i].title.split(' - BBC News', 1)[0]
- for i in range(len(bbc.h3Arr)):
- if ' - BBC News' in bbc.h3Arr[i].title:
- bbc.h3Arr[i].title=bbc.h3Arr[i].title.split(' - BBC News', 1)[0]
-
- return bbc
-
-
-
-def buildWeeklyStandard():
- url='http://www.weeklystandard.com'
- name='Weekly Standard'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- h1=h1.split('<div id="region_1"', 1)[1]
- h1=h1.split('<div id="region_2"', 1)[0]
- h1=h1.split('<div class="lead-photo">', 1)[1]
- h1=h1.split('href="', 1)[1]
- h1=h1.split('"', 1)[0]
- h1s=[h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('<div class="widget lead-story layout-3col-feature" data-count="2">', 1)[1]
- h2=h2.split('<div id="region_2"', 1)[0]
- while '<div class="lead-photo">' in h2:
- h2=h2.split('<div class="lead-photo">', 1)[1]
- h2=h2.split('href="', 1)[1]
- x=h2.split('"', 1)[0]
- if h1 not in x:
- h2s.append(x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('Today\'s Standard', 1)[1]
- h3=h3.split('<div id="region_3"', 1)[0]
- while '<div class="lead-photo">' in h3:
- h3=h3.split('<div class="lead-photo">', 1)[1]
- h3=h3.split('href="', 1)[1]
- x=h3.split('"', 1)[0]
- if h1 not in x:
- h3s.append(x)
-
- #Need to add URL prefix to all URLs
- for i in range(len(h1s)):
- h1s[i]=url+h1s[i]
- for i in range(len(h2s)):
- h2s[i]=url+h2s[i]
- for i in range(len(h3s)):
- h3s[i]=url+h3s[i]
-
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- wkl=buildNewsSource2(name, url, h1s, h2s, h3s)
-
- #REMOVE BAD STORIES
- badTitleArr=None
- ## if flagged again, remove Micah Mattix
- badDescArr=['Matt Labash']
- badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner', 'MARK HEMINGWAY']
- badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png']
- wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr)
-
- return wkl
-
-
-
-
-def buildNPR():
- url='http://www.npr.org/sections/news/'
- name='NPR'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- h1=h1.split('<a id="mainContent">', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
- h1s=[h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('<article class="item has-image">', 1)[1]
- h2=h2.split('<!-- END CLASS=\'FEATURED-3-UP\' -->', 1)[0]
- while '<article class="item has-image">' in h2:
- h2=h2.split('<article class="item has-image">', 1)[1]
- h2=h2.split('<a href="', 1)[1]
- x=h2.split('"', 1)[0]
- if h1 not in x:
- h2s.append(x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('<div id="overflow" class="list-overflow"', 1)[1]
- h3=h3.split('<!-- END ID="OVERFLOW" CLASS="LIST-OVERFLOW"', 1)[0]
- while '<h2 class="title"><a href="' in h3:
- h3=h3.split('<h2 class="title"><a href="', 1)[1]
- x=h3.split('"', 1)[0]
- if h1 not in x:
- h3s.append(x)
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
- npr=buildNewsSource2(name, url, h1s, h2s, h3s)
-
- #REMOVE BAD STORIES
- badTitleArr=['The Two-Way']
- badDescArr=None
- badAuthorArr=['Domenico Montanaro']
- badImgArr=None
- npr=removeBadStories(npr, badTitleArr, badDescArr, badAuthorArr, badImgArr)
-
- return npr
-
-
-
-
-
-def buildABC():
- url='http://www.abcnews.go.com'
- name='ABC News'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- h1=h1.split('id="row-1"', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
- h1s=[h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('id="row-2"', 1)[1]
- h2=h2.split('id="row-3"', 1)[0]
- h2=h2.split('card single row-item')[1:3] #should just be 2 of these
- for x in h2:
- x=x.split('<a href="', 1)[1]
- x=x.split('"', 1)[0]
- if h1 not in x:
- h2s.append(x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('id="row-1"', 1)[1]
- h3=h3.split('tab-data active', 1)[1]
- h3=h3.split('tab-data"', 1)[0] #note the trailing quotation
- while '<a href="' in h3:
- h3=h3.split('<a href="', 1)[1]
- x=h3.split('"', 1)[0]
- if h1 not in x:
- h3s.append(x)
-
- h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
- abc=buildNewsSource2(name, url, h1s, h2s, h3s)
-
- #REMOVE BAD STORIES
- badTitleArr=None
- badDescArr=None
- badAuthorArr=None
- badImgArr=None
- badURLArr=None
- abc=removeBadStories(abc, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
-
- return abc
-
-
-
-
-def buildFoxNews():
- url = 'http://foxnews.com'
- name = 'Fox News'
-
- # DOWNLOAD HOMEPAGE CONTENT
- content = urlToContent(url)
- soup = BeautifulSoup(content, 'lxml')
-
- # get main headline
- h1 = soup.find('div', id='big-top')\
- .find('div', class_='primary')\
- .find('h1')\
- .find('a')
- h1 = h1['href']
- h1s = [h1]
- h1s = ['http:' + x if x.startswith('//') else x for x in h1s]
-
- #GET SECONDARY HEADLINES
- h2s = soup.find('div', id='big-top').find('div', class_='top-stories').select('li > a')
- h2s = [x['href'] for x in h2s]
- h2s = ['http:' + x if x.startswith('//') else x for x in h2s]
-
- #GET TERTIARY HEADLINES
- h3s = []
- for ul in soup.find('section', id='latest').find_all('ul', recursive=False):
- for li in ul.find_all('li', recursive=False):
- h3s.append(li.find('a')['href'])
- h3s = ['http:' + x if x.startswith('//') else x for x in h3s]
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- fox=buildNewsSource2(name, url, h1s, h2s, h3s)
-
- #REMOVE BAD STORIES
- badTitleArr=['O&#039;Reilly', 'Fox News', 'Brett Baier', 'Tucker']
- badDescArr=['Sean Hannity']
- badAuthorArr=['Bill O\'Reilly', 'Sean Hannity', 'Howard Kurtz']
- badImgArr=['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
- badURLArr=['http://www.foxnews.com/opinion', 'videos.foxnews.com']
- fox=removeBadStories(fox, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
-
- return fox
-
-
-
-def buildNYT():
- url='http://www.nytimes.com'
- name='New York Times'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- #this will likely need if/else logic
- h1=content
-
- if 'story theme-summary banner' in h1:
- #This is with a large headline over a and b columns
- h1=h1.split('story theme-summary banner', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
- else:
- #otherwise, pull the first story from the A column
- h1=h1.split('<div class="a-column column">', 1)[1]
- h1=h1.split('<article class="story theme-summary lede"', 1)[1]
- h1=h1.split('<a href="', 1)[1].split('"', 1)[0]
- h1s=[h1]
-
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- #A column
- h2=h2.split('<div class="a-column column">', 1)[1]
- h2=h2.split('<!-- close a-column -->', 1)[0]
- #remove "collection" sets
- while '<div class="collection headlines">' in h2:
- arr=h2.split('<div class="collection headlines">', 1)
- h2=arr[0]+arr[1].split('</ul>', 1)[1]
- #Grab the remaining URLs
- while '<a href="' in h2:
- h2=h2.split('<a href="', 1)[1]
- x=h2.split('"', 1)[0]
- if h1 not in x:
- h2s.append(x)
-
- #GET TERTIARY HEADLINES
- h3s=[]
- #B column
- h3=content
- h3=h3.split('<div class="b-column column">', 1)[1]
- h3=h3.split('<!-- close b-column -->', 1)[0]
- #remove "collection" sets
- while '<div class="collection headlines">' in h3:
- arr=h3.split('<div class="collection headlines">', 1)
- h3=arr[0]+arr[1].split('</ul>', 1)[1]
- #Grab the remaining URLs
- while '<a href="' in h3:
- h3=h3.split('<a href="', 1)[1]
- x=h3.split('"', 1)[0]
- if (h1 not in x) and (x not in h3s):
- h3s.append(x)
-
- '''
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- if '<!-- close lede-package-region -->' in h3:
- h3=h3.split('<!-- close lede-package-region -->', 1)[1]
- h3=h3.split('<a href="https://www.nytimes.com/tips">', 1)[0]
- elif '/video/the-daily-360' in h3:
- h3=h3.split('/video/the-daily-360')[-1]
- h3=h3.split('More News', 1)[0]
- #remove "collection" sets
- while '<div class="collection headlines">' in h2:
- arr=h3.split('<div class="collection headlines">', 1)
- h3=arr[0]+arr[1].split('</ul>', 1)[1]
-
- #Grab the remaining URLs
- while '<a href="' in h3:
- h3=h3.split('<a href="', 1)[1]
- x=h3.split('"', 1)[0]
- if (h1 not in x) and (x not in h3s):
- h3s.append(x)
- '''
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
- nyt=buildNewsSource2(name, url, h1s, h2s, h3s)
- nyt=removeBadStories(nyt, None, None, None, None, ['https://www.nytimes.com/section/magazine', 'https://www.nytimes.com/newsletters/the-interpreter'])
-
-
- return nyt
-
-
-
-
-'''
-NYT
-EXAMPLE OF BIG HEADLINE SPANNING BOTH A AND B COLUMNS
-
-<div class="span-ab-layout layout">
-
- <div class="ab-column column">
-
- <section id="top-news" class="top-news">
- <h2 class="section-heading visually-hidden">Top News</h2>
-
- <div class="above-banner-region region">
-
- <div class="collection">
- <div class="hpHeader" id="top-megapackage-kicker">
- <h6><a href="http://www.nytimes.com/pages/politics/index.html?src=hpHeader">The 45th President</a></h6>
-</div>
-
-</div>
-
- </div><!-- close above-banner-region -->
-
- <div class="span-ab-top-region region">
-
- <div class="collection">
- <article class="story theme-summary banner" id="topnews-100000004932040" data-story-id="100000004932040" data-rank="0" data-collection-renderstyle="Banner">
- <h1 class="story-heading"><a href="https://www.nytimes.com/2017/02/14/us/politics/fbi-interviewed-mike-flynn.html">F.B.I. Questioned Flynn About Russia Call</a></h1>
-</article>
-</div>
-
- </div><!-- close span-ab-top-region -->
-'''
diff --git a/unbiased/sources/__init__.py b/unbiased/sources/__init__.py
new file mode 100644
index 0000000..e4a473a
--- /dev/null
+++ b/unbiased/sources/__init__.py
@@ -0,0 +1,10 @@
+import importlib
+import pkgutil
+
+from unbiased.sources.base import NewsSource
+
+def get_sources():
+ for loader, name, is_pkg in pkgutil.walk_packages(__path__):
+ if name != 'base':
+ importlib.import_module('unbiased.sources.' + name)
+ return {x.shortname.lower(): x for x in NewsSource.__subclasses__()}
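
The registry returned by get_sources() is what main.py's new --sources flag indexes into. A minimal usage sketch (illustrative only; the keys are the lower-cased shortname of each source class added below):

    from unbiased.sources import get_sources

    sources = get_sources()
    # e.g. {'abc': ABC, 'bbc': BBC, 'cbs': CBS, 'csm': CSM, 'fox': Fox,
    #       'guardian': TheGuardian, 'hill': TheHill, 'npr': NPR,
    #       'washtimes': TheWashingtonTimes}
    fox = sources['fox'].build()          # crawl foxnews.com and return a Fox instance
    titles = [a.title for a in fox.h1s]   # Article objects for the top-tier stories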
diff --git a/unbiased/sources/abc.py b/unbiased/sources/abc.py
new file mode 100644
index 0000000..d9092a2
--- /dev/null
+++ b/unbiased/sources/abc.py
@@ -0,0 +1,42 @@
+from unbiased.sources.base import NewsSource
+
+class ABC(NewsSource):
+
+ name = 'ABC News'
+ shortname = 'ABC'
+ url = 'http://abcnews.go.com/'
+
+ @classmethod
+ def _fetch_urls(cls):
+ """
+ Returns three tuples of urls, one for each of
+ the three tiers.
+ """
+ soup = cls._fetch_content(cls.url)
+
+ # get primary headline
+ h1 = soup.find('article', class_='hero')\
+ .find('div', class_='caption-wrapper').h1.a['href']
+ h1s = (h1,)
+
+ # get secondary headlines
+ h2s = soup.find('div', id='row-2')\
+ .find_all('article', class_='card single row-item')
+ h2s = tuple(x.find('div', class_='caption-wrapper').h1.a['href'] for x in h2s)
+
+ # get tertiary headlines
+ h3s = soup.find('div', id='row-1')\
+ .find('article', class_='headlines')\
+ .find('div', id='tab-content')\
+ .find_all('li', class_=['story', 'wirestory'])
+ h3s = tuple(x.div.h1.a['href'] for x in h3s)
+
+ return h1s, h2s, h3s
+
+ @classmethod
+ def _normalize_url(cls, url):
+ """
+ ABC News urls include an 'id' query param that we need to
+ keep in order for the URL to work.
+ """
+ return NewsSource._normalize_url(url, ['id'])
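
To make the query handling concrete, here is what the override above produces for a hypothetical article URL (the path and the tracking parameter are invented for illustration):

    ABC._normalize_url('http://abcnews.go.com/Politics/story?id=12345&cid=social')
    # -> 'http://abcnews.go.com/Politics/story?id=12345'
    # scheme, host, and path pass through; only the whitelisted 'id' param is kept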
diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py
new file mode 100644
index 0000000..9f51287
--- /dev/null
+++ b/unbiased/sources/base.py
@@ -0,0 +1,210 @@
+import collections
+import html
+import logging
+import urllib
+
+from bs4 import BeautifulSoup
+import requests
+
+logger = logging.getLogger('unbiased')
+
+class Article(object):
+
+ def __init__(self, source, title, author, description, url, img):
+ self.source = source
+ self.title = title
+ self.author = author
+ self.description = description
+ self.url = url
+ self.img = img
+
+ def __repr__(self):
+ return 'Article({}, {}, {}, {}, {}, {})'.format(self.source, self.title, self.author, self.description, self.url, self.img)
+
+class NewsSource(object):
+ """
+ Abstract base class.
+ To implement:
+ - set 'name', 'shortname', and 'url'
+ - set 'bad_' variables to blacklist terms and phrases
+ - implement '_fetch_urls()', which should return three tuples
+ of urls, one for each tier
+ - override any of the '_get_*()' functions as necessary
+ """
+
+ name = None
+ shortname = None
+ url = None
+
+ bad_titles = None
+ bad_authors = None
+ bad_descriptions = None
+ bad_imgs = None
+ bad_urls = None
+
+ def __init__(self, h1s, h2s, h3s):
+ self.h1s = h1s
+ self.h2s = h2s
+ self.h3s = h3s
+
+ @classmethod
+ def build(cls):
+ h1s, h2s, h3s = cls._fetch_urls()
+ logger.info('Fetched {} h1s, {} h2s, {} h3s'.format(len(h1s), len(h2s), len(h3s)))
+ h1s = tuple(cls._normalize_url(x) for x in h1s)
+ h2s = tuple(cls._normalize_url(x) for x in h2s)
+ h3s = tuple(cls._normalize_url(x) for x in h3s)
+ h1s, h2s, h3s = cls._remove_duplicates(h1s, h2s, h3s)
+ h1s, h2s, h3s = cls._fetch_articles(h1s, h2s, h3s)
+ h1s, h2s, h3s = cls._remove_all_bad_stories(h1s, h2s, h3s)
+ return cls(h1s, h2s, h3s)
+
+ @classmethod
+ def _fetch_content(cls, url):
+ res = requests.get(url)
+ if res.status_code == 200:
+ content = res.text
+ else:
+ raise Exception("Failed to download {}".format(url))
+ return BeautifulSoup(content, 'lxml')
+
+ @classmethod
+ def _normalize_url(cls, url, keep_query_vars=None):
+ """
+ Make sure they have a scheme.
+ Make sure they have a host.
+ Trim any query string, params, or fragments.
+ """
+ cls_url = urllib.parse.urlparse(cls.url)
+ url = urllib.parse.urlparse(url)
+ if keep_query_vars is None:
+ query = ''
+ else:
+ query_vars = {}
+ qs = urllib.parse.parse_qs(url.query)
+ for v in keep_query_vars:
+ if v in qs:
+ query_vars[v] = qs[v]
+ query_pairs = []
+ for k, i in query_vars.items():
+ for v in i:
+ query_pairs.append('{}={}'.format(k, v))
+ query = '&'.join(query_pairs)
+ url = (url.scheme or cls_url.scheme, url.netloc or cls_url.netloc, url.path, '', query, '')
+ return urllib.parse.urlunparse(url)
+
+ @classmethod
+ def _remove_duplicates(cls, h1s, h2s, h3s):
+ h2s = tuple(x for x in h2s if x not in h1s)
+ h3s = tuple(x for x in h3s if x not in h1s and x not in h2s)
+ return h1s, h2s, h3s
+
+ @classmethod
+ def _remove_bad_stories(cls, articles, element, filters):
+ # TODO: replace string filters with regex filters
+ if filters is None:
+ return articles
+ new_articles = []
+ for article in articles:
+ save = True
+ for f in filters:
+ if getattr(article, element) and f in getattr(article, element):
+ save = False
+ break
+ if save:
+ new_articles.append(article)
+ return tuple(new_articles)
+
+ @classmethod
+ def _remove_all_bad_stories(cls, h1s, h2s, h3s):
+ new_articles = []
+ for articles in [h1s, h2s, h3s]:
+ articles = cls._remove_bad_stories(articles, 'title', cls.bad_titles)
+ articles = cls._remove_bad_stories(articles, 'description', cls.bad_descriptions)
+ articles = cls._remove_bad_stories(articles, 'author', cls.bad_authors)
+ articles = cls._remove_bad_stories(articles, 'img', cls.bad_imgs)
+ articles = cls._remove_bad_stories(articles, 'url', cls.bad_urls)
+ new_articles.append(articles)
+ if len(new_articles[0]) == 0 and len(new_articles[1]) > 0:
+ new_articles[0] = new_articles[0] + new_articles[1][:1]
+ new_articles[1] = new_articles[1][1:]
+ return tuple(tuple(x) for x in new_articles)
+
+ @classmethod
+ def _fetch_articles(cls, h1s, h2s, h3s):
+ ret = []
+ for urls in [h1s, h2s, h3s]:
+ articles = []
+ for url in urls:
+ article = cls._fetch_article(url)
+ if article is not None:
+ articles.append(article)
+ ret.append(articles)
+ return tuple(tuple(x) for x in ret)
+
+ @classmethod
+ def _fetch_article(cls, url):
+ logger.debug(cls.name)
+ logger.debug(url)
+
+ try:
+ soup = cls._fetch_content(url)
+ except Exception as ex:
+ logger.debug("""ARTICLE DOWNLOADING ERROR
+ SOURCE:\t{}
+ URL:\t{}""".format(cls.name, url))
+ return None
+
+ url_parts = urllib.parse.urlparse(url)
+ scheme = url_parts.scheme
+
+ try:
+ img = cls._get_image(soup)
+ img = urllib.parse.urlparse(img, scheme=scheme).geturl()
+ logger.debug(img)
+
+ title = cls._get_title(soup)
+ logger.debug(title)
+
+ author = cls._get_author(soup)
+ logger.debug(author)
+
+ description = cls._get_description(soup)
+ logger.debug(description)
+ description = cls._remove_self_refs(description)
+ logger.debug(description)
+ except Exception:
+ logger.debug("""ARTICLE PARSING ERROR
+ SOURCE:\t{}
+ URL:\t{}""".format(cls.name, url))
+ return None
+
+ return Article(cls.name, title, author, description, url, img)
+
+ @classmethod
+ def _get_image(cls, soup):
+ return soup.find('meta', property='og:image')['content']
+
+ @classmethod
+ def _get_title(cls, soup):
+ return soup.find('meta', property='og:title')['content']
+
+ @classmethod
+ def _get_author(cls, soup):
+ for author_tag in ['article:author', 'dc.creator', 'author']:
+ author = soup.find('meta', property=author_tag)
+ if author is None:
+ continue
+ return author['content']
+ return None
+
+ @classmethod
+ def _get_description(cls, soup):
+ return soup.find('meta', property='og:description')['content']
+
+ @classmethod
+ def _remove_self_refs(cls, description):
+ description = description.replace(cls.name + "'s", '***')
+ description = description.replace(cls.name + "'", '***')
+ description = description.replace(cls.name, '***')
+ return description
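
With these hooks in place, adding a source is mostly declarative. A minimal sketch of a hypothetical subclass (the class name, URL, and selectors are invented and do not correspond to a real site):

    from unbiased.sources.base import NewsSource

    class Example(NewsSource):

        name = 'Example News'
        shortname = 'example'
        url = 'http://example.com/news'

        bad_titles = ['Opinion']  # optional blacklists; leave as None to skip filtering

        @classmethod
        def _fetch_urls(cls):
            soup = cls._fetch_content(cls.url)
            h1s = (soup.find('div', class_='lead-story').a['href'],)
            h2s = tuple(a['href'] for a in soup.select('.top-stories a'))
            h3s = tuple(a['href'] for a in soup.select('.latest a'))
            return h1s, h2s, h3s

Dropping a module like this into unbiased/sources/ is enough for get_sources() to find it, since discovery is based on NewsSource.__subclasses__().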
diff --git a/unbiased/sources/bbc.py b/unbiased/sources/bbc.py
new file mode 100644
index 0000000..0dd0f80
--- /dev/null
+++ b/unbiased/sources/bbc.py
@@ -0,0 +1,26 @@
+from unbiased.sources.base import NewsSource
+
+class BBC(NewsSource):
+
+ name = 'BBC News'
+ shortname = 'bbc'
+ url = 'http://www.bbc.com/news/world/us_and_canada'
+
+ bad_imgs = ['bbc_news_logo.png']
+
+ @classmethod
+ def _fetch_urls(cls):
+ soup = cls._fetch_content(cls.url)
+
+ h1s = soup.find('div', class_='buzzard-item')\
+ .find('a', class_='title-link')
+ h1s = (h1s['href'],)
+
+ h2s = soup.find_all('div', attrs={'class': 'pigeon__column', 'data-entityid': True})
+ h2s = tuple(x.find('a', class_='title-link')['href'] for x in h2s)
+
+ # get tertiary headlines
+ h3s = soup.find_all('div', attrs={'class': 'macaw-item', 'data-entityid': True})
+ h3s = tuple(x.find('a', class_='title-link')['href'] for x in h3s)
+
+ return h1s, h2s, h3s
diff --git a/unbiased/sources/cbs.py b/unbiased/sources/cbs.py
new file mode 100644
index 0000000..295e671
--- /dev/null
+++ b/unbiased/sources/cbs.py
@@ -0,0 +1,37 @@
+from unbiased.sources.base import NewsSource
+
+class CBS(NewsSource):
+
+ name = 'CBS News'
+ shortname = 'cbs'
+ url = 'https://www.cbsnews.com/'
+
+ bad_titles = ['60 Minutes']
+ bad_descriptions = ['60 Minutes']
+ bad_urls = ['whats-in-the-news-coverart']
+
+ @classmethod
+ def _fetch_urls(cls):
+ soup = cls._fetch_content(cls.url)
+
+ # get primary headline
+ h1 = soup.find('h1', class_='title')
+ # sometimes they lead with a video
+ # if so, we'll pull the first h2 into the h1 slot later
+ if h1 is not None:
+ h1s = (h1.a['href'],)
+
+ # get secondary headlines
+ h2s = soup.find('div', attrs={'data-tb-region': 'Big News Area Side Assets'})\
+ .ul.find_all('li', attrs={'data-tb-region-item': True})
+ h2s = tuple(x.a['href'] for x in h2s)
+ if h1 is None:
+ h1s = (h2s[0],)
+ h2s = tuple(h2s[1:])
+
+ # get tertiary headlines
+ h3s = soup.find('div', attrs={'data-tb-region': 'Hard News'})\
+ .ul.find_all('li', attrs={'data-tb-region-item': True})
+ h3s = tuple(x.a['href'] for x in h3s[:5])
+
+ return h1s, h2s, h3s
diff --git a/unbiased/sources/csm.py b/unbiased/sources/csm.py
new file mode 100644
index 0000000..4e1eea5
--- /dev/null
+++ b/unbiased/sources/csm.py
@@ -0,0 +1,41 @@
+from unbiased.sources.base import NewsSource
+
+class CSM(NewsSource):
+
+ name = 'Christian Science Monitor'
+ shortname = 'csm'
+ url = 'https://www.csmonitor.com/USA'
+
+ bad_titles = ['Change Agent']
+ bad_imgs = ['csm_logo']
+ bad_urls = ['difference-maker']
+
+ @classmethod
+ def _fetch_urls(cls):
+ soup = cls._fetch_content(cls.url)
+
+ # get primary headline
+ h1 = soup.find('div', id='block-0-0')\
+ .find('h3', class_='story_headline')\
+ .a['href']
+ h1s = (h1,)
+
+ # get secondary headlines
+ h2_blocks = soup.find_all('div', id=['block-1-0', 'block-0-1'])
+ h2s = []
+ for block in h2_blocks:
+ hblocks = block.find_all('h3', class_='story_headline')
+ for hblock in hblocks:
+ h2s += [x for x in hblock.find_all('a') if 'first-look' not in x['href']]
+ h2s = tuple(x['href'] for x in h2s)
+
+ # get tertiary headlines
+ h3_blocks = soup.find_all('div', id='block-0-2')
+ h3s = []
+ for block in h3_blocks:
+ hblocks = block.find_all('h3', class_='story_headline')
+ for hblock in hblocks:
+ h3s += [x for x in hblock.find_all('a') if 'first-look' not in x['href']]
+ h3s = tuple(x['href'] for x in h3s)
+
+ return h1s, h2s, h3s
diff --git a/unbiased/sources/fox.py b/unbiased/sources/fox.py
new file mode 100644
index 0000000..ce7730f
--- /dev/null
+++ b/unbiased/sources/fox.py
@@ -0,0 +1,41 @@
+from unbiased.sources.base import NewsSource
+
+class Fox(NewsSource):
+
+ name = 'Fox News'
+ shortname = 'Fox'
+ url = 'http://www.foxnews.com'
+
+ bad_titles = ['O&#039;Reilly', 'Fox News', 'Brett Baier', 'Tucker']
+ bad_descriptions = ['Sean Hannity']
+ bad_authors = ['Bill O\'Reilly', 'Sean Hannity', 'Howard Kurtz']
+ bad_imgs = ['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
+ bad_urls = ['http://www.foxnews.com/opinion', 'videos.foxnews.com']
+
+ @classmethod
+ def _fetch_urls(cls):
+ """
+ Returns three tuples of urls, one for each of
+ the three tiers.
+ """
+ soup = cls._fetch_content(cls.url)
+
+ # get primary headline
+ h1 = soup.find('div', id='big-top')\
+ .find('div', class_='primary')\
+ .find('h1')\
+ .find('a')['href']
+ h1s = (h1,)
+
+ # get secondary headlines
+ h2s = soup.find('div', id='big-top').find('div', class_='top-stories').select('li > a')
+ h2s = tuple(x['href'] for x in h2s)
+
+ # get tertiary headlines
+ h3s = []
+ for ul in soup.find('section', id='latest').find_all('ul', recursive=False):
+ for li in ul.find_all('li', recursive=False):
+ h3s.append(li.find('a')['href'])
+ h3s = tuple(h3s)
+
+ return h1s, h2s, h3s
diff --git a/unbiased/sources/guardian.py b/unbiased/sources/guardian.py
new file mode 100644
index 0000000..5a1c3dd
--- /dev/null
+++ b/unbiased/sources/guardian.py
@@ -0,0 +1,37 @@
+import html
+
+from unbiased.sources.base import NewsSource
+
+class TheGuardian(NewsSource):
+
+ name = 'The Guardian'
+ shortname = 'Guardian'
+ url = 'https://www.theguardian.com/us'
+
+ bad_authors = ['Tom McCarthy', 'Andy Hunter']
+ bad_urls = ['https://www.theguardian.com/profile/ben-jacobs']
+
+ @classmethod
+ def _fetch_urls(cls):
+ soup = cls._fetch_content(cls.url)
+
+ url_groups = []
+ for htag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+ hblocks = soup.find('section', id='headlines').find_all(htag)
+ urls = [x.a['href'] for x in hblocks]
+ url_groups.append(urls)
+ url_groups = [x for x in url_groups if len(x) > 0]
+ if len(url_groups) < 3:
+ raise Exception('not enough article groups on Guardian home page!')
+
+ return tuple(url_groups[0]), tuple(url_groups[1]), tuple(url_groups[2])
+
+ @classmethod
+ def _get_image(cls, soup):
+ if soup.find('img', class_='maxed'):
+ img = soup.find('img', class_='maxed')['src']
+ if soup.find('meta', itemprop='image'):
+ img = soup.find('meta', itemprop='image')['content']
+ if soup.find('img', class_='immersive-main-media__media'):
+ img = soup.find('img', class_='immersive-main-media__media')['src']
+ return html.unescape(img)
diff --git a/unbiased/sources/npr.py b/unbiased/sources/npr.py
new file mode 100644
index 0000000..e52459f
--- /dev/null
+++ b/unbiased/sources/npr.py
@@ -0,0 +1,29 @@
+from unbiased.sources.base import NewsSource
+
+class NPR(NewsSource):
+
+ name = 'NPR News'
+ shortname = 'npr'
+ url = 'http://www.npr.org/sections/news/'
+
+ bad_titles = ['The Two-Way']
+ bad_authors = ['Domenico Montanaro']
+
+ @classmethod
+ def _fetch_urls(cls):
+ soup = cls._fetch_content(cls.url)
+
+ featured = soup.find('div', class_='featured-3-up')\
+ .find_all('article', recursive=False)
+
+ h1s = featured[:1]
+ h1s = tuple(x.find('h2', class_='title').a['href'] for x in h1s)
+ h2s = featured[1:]
+ h2s = tuple(x.find('h2', class_='title').a['href'] for x in h2s)
+
+ # get tertiary headlines
+ h3s = soup.find('div', id='overflow')\
+ .find_all('article', recursive=False)
+ h3s = tuple(x.find('h2', class_='title').a['href'] for x in h3s[:5])
+
+ return h1s, h2s, h3s
diff --git a/unbiased/sources/thehill.py b/unbiased/sources/thehill.py
new file mode 100644
index 0000000..862204e
--- /dev/null
+++ b/unbiased/sources/thehill.py
@@ -0,0 +1,39 @@
+
+from unbiased.sources.base import NewsSource
+
+class TheHill(NewsSource):
+
+ name = 'The Hill'
+ shortname = 'Hill'
+ url = 'http://thehill.com'
+
+ bad_titles = ['THE MEMO']
+ bad_authors = ['Matt Schlapp', 'Juan Williams', 'Judd Gregg']
+
+ @classmethod
+ def _fetch_urls(cls):
+ soup = cls._fetch_content(cls.url)
+
+ h1 = soup.find('h1', class_='top-story-headline')\
+ .find('a')['href']
+ h1s = (h1,)
+
+ h23s = soup.find('div', class_='section-top-content')\
+ .find_all('div', class_='top-story-item')
+ h2s = set([x.h4.a['href'] for x in h23s if 'small' not in x['class']])
+ h2s = tuple(h2s)
+
+ h3s = set([x.h4.a['href'] for x in h23s if 'small' in x['class']])
+ h3s = tuple(h3s)
+
+ return h1s, h2s, h3s
+
+ @classmethod
+ def _get_description(cls, soup):
+ try:
+ return NewsSource._get_description(soup)
+ except Exception:
+ # fall back on grabbing text from the article
+ desc = soup.find('div', class_='field-items')
+ return desc.text[:200].rsplit(' ', 1)[0]
+
diff --git a/unbiased/sources/washtimes.py b/unbiased/sources/washtimes.py
new file mode 100644
index 0000000..1be1838
--- /dev/null
+++ b/unbiased/sources/washtimes.py
@@ -0,0 +1,31 @@
+from unbiased.sources.base import NewsSource
+
+class TheWashingtonTimes(NewsSource):
+
+ name = 'The Washington Times'
+ shortname = 'WashTimes'
+ url = 'http://www.washingtontimes.com/'
+
+ @classmethod
+ def _fetch_urls(cls):
+ soup = cls._fetch_content(cls.url)
+
+ h1 = soup.find('article', class_='lead-story')\
+ .find(class_='article-headline')\
+ .a['href']
+ h1s = (h1,)
+
+ top_articles = soup.find('section', class_='top-news')\
+ .find_all('article', recursive=False)
+ h2s = []
+ for a in top_articles:
+ if a.attrs.get('class') is None:
+ h2s.append(a.a['href'])
+ h2s = tuple(h2s)
+
+ h3s = soup.find('section', class_='more-from desktop-only')\
+ .ul.find_all('a')
+ h3s = [x['href'] for x in h3s]
+ h3s = tuple(h3s)
+
+ return h1s, h2s, h3s
diff --git a/unbiased/spotCheck.py b/unbiased/spotCheck.py
deleted file mode 100755
index 7ce50d3..0000000
--- a/unbiased/spotCheck.py
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-
-from unbiased.parser import *
-from unbiased.unbiasedObjects import *
-
-def spotCheck(src):
-
- fns = {'hil' : buildTheHill,
- 'cbs' : buildCBS,
- 'npr' : buildNPR,
- 'fox' : buildFoxNews,
- 'gdn' : buildGuardian,
- 'blz' : buildBlaze,
- 'bbc' : buildBBC,
- 'nbc' : buildNBC,
- 'wat' : buildWashTimes,
- 'csm' : buildCSM,
- 'abc' : buildABC}
-
- data=fns[src]()
-
- print('H1s:\n--------------')
- for h in data.h1Arr:
- print(h.title)
-
- print('\n\nH2s:\n--------------')
- for h in data.h2Arr:
- print(h.title)
-
- print('\n\nH3s:\n--------------')
- for h in data.h3Arr:
- print(h.title)
-
- print('\n\n')
-
-
-
-if __name__=='__main__':
- spotCheck(sys.argv[1])
diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py
deleted file mode 100644
index 6ec89b7..0000000
--- a/unbiased/unbiasedFunctions.py
+++ /dev/null
@@ -1,277 +0,0 @@
-import html
-import io
-import logging
-import os
-import pkgutil
-import random
-import re
-import time
-import urllib.parse
-
-from PIL import Image
-import requests
-
-from unbiased.unbiasedObjects import *
-
-logger = logging.getLogger('unbiased')
-
-#take in a url and delimiters, return twitter card
-def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
-
- debugging=False
- if debugging:
- logger.debug(sourceName)
- logger.debug(url)
-
- url_parts = urllib.parse.urlparse(url)
- scheme = url_parts.scheme
-
- #download url
- try:
- res = requests.get(url)
- except Exception as ex:
- logger.debug("""ARTICLE DOWNLOADING ERROR
- SOURCE:\t{}
- URL:\t{}""".format(sourceName, url))
- return None
-
- if res.status_code == 200:
- content = res.text
- else:
- logger.debug("""ARTICLE DOWNLOADING ERROR
- SOURCE:\t{}
- URL:\t{}""".format(sourceName, url))
- return None
-
- try:
- if sourceName=='The Guardian US':
- #The Guardian puts an identifying banner on their og:images
- #grab the main image from the page instead
-
- #scenario 1: regular image
- if '<img class="maxed' in content:
- img=content.split('<img class="maxed', 1)[1]
- img=img.split('src="', 1)[1].split('"')[0]
- #scenario 2: video in image spot
- elif '<meta itemprop="image"' in content:
- img=content.split('<meta itemprop="image"', 1)[1]
- img=img.split('content="', 1)[1].split('"')[0]
- #scenario 3: photo essays
- elif '<img class="immersive-main-media__media"' in content:
- img=content.split('<img class="immersive-main-media__media"', 1)[1]
- img=img.split('src="', 1)[1].split('"')[0]
- img = html.unescape(img)
-
- else:
- if 'og:image' in content:
- img=content.split('og:image" content=')[1][1:].split('>')[0]
- elif sourceName=='ABC News':
- img='https://c1.staticflickr.com/7/6042/6276688407_12900948a2_b.jpgX'
- if img[-1]=='/':
- #because the quote separator could be ' or ",
- #trim to just before it then lop it off
- img=img[:-1].strip()
- img=img[:-1]
- # fix the scheme if it's missing
- img = urllib.parse.urlparse(img, scheme=scheme).geturl()
-
- if debugging:
- logger.debug(img)
-
- title=content.split('og:title" content=')[1][1:].split('>')[0]
- if title[-1]=='/':
- title=title[:-1].strip()
- title=title[:-1]
-
- if debugging:
- logger.debug(title)
-
-
- author=''
- if sourceName=='The Blaze':
- if 'class="article-author">' in content:
- author=content.split('class="article-author">')[1].split('<')[0]
- elif 'class="article-author" href="' in content:
- author=content.split('class="article-author" href="')[1]
- author=author.split('>')[1].split('<')[0].strip()
- else:
- authorTags=['article:author', 'dc.creator', 'property="author']
- for tag in authorTags:
- if tag in content:
- author=content.split(tag+'" content=')[1][1:].split('>')[0]
- author=author[:-1]
- #trim an extra quotation mark for The Hill
- if sourceName=='The Hill':
- author=author.split('"', 1)[0]
- break
-
- if debugging:
- logger.debug(author)
-
-
- if 'og:description' in content:
- description=content.split('og:description" content=')[1][1:].split('>')[0]
- if description[-1]=='/':
- description=description[:-1].strip()
- description=description[:-1]
- else:
- if sourceName=='The Hill':
- description=content.split('div class="field-items"')[-1]
- description=re.sub('<[^<]+?>', '', description)
- description=description[1:200]
- else:
- logger.debug("SHOULDN'T GET HERE")
-
- #strip out self-references
- description=description.replace(sourceName+"'s", '***')
- description=description.replace(sourceName+"'", '***')
- description=description.replace(sourceName, '***')
-
- if debugging:
- logger.debug(description)
-
-
- a=Article(html.unescape(title), url, img, html.unescape(description), sourceName, html.unescape(author))
- return a
-
- except Exception:
- logger.debug("""ARTICLE PARSING ERROR
- SOURCE:\t{}
- URL:\t{}""".format(sourceName, url))
- return None
-
-
-def pickStories(newsSourceArr):
- # TODO: refactor to avoid infinite loops
- #set the random order for sources
- h1RandomSources=[]
- guard = 0
- while len(h1RandomSources)<4:
- x=random.sample(range(len(newsSourceArr)), 1)[0]
- if len(newsSourceArr[x].h1Arr)>0:
- if x not in h1RandomSources:
- h1RandomSources.append(x)
- else:
- logger.debug('No H1 stories in '+newsSourceArr[x].name)
- guard += 1
- if guard > 100:
- return [], [], []
-
- #For h2s and h3s, select N random sources (can repeat), then
- #a non-repetitive random article from within
- h2RandomPairs=[]
- while len(h2RandomPairs) < 6:
- x=random.sample(range(len(newsSourceArr)), 1)[0]
- if len(newsSourceArr[x].h2Arr) > 0:
- y=random.sample(range(len(newsSourceArr[x].h2Arr)), 1)[0]
- pair=[x,y]
- if not pair in h2RandomPairs:
- h2RandomPairs.append(pair)
- else:
- logger.debug('No H2 stories in '+newsSourceArr[x].name)
-
- h3RandomPairs=[]
- while len(h3RandomPairs) < 12:
- x=random.sample(range(len(newsSourceArr)), 1)[0]
- if len(newsSourceArr[x].h3Arr) > 0:
- y=random.sample(range(len(newsSourceArr[x].h3Arr)), 1)[0]
- pair=[x,y]
- if not pair in h3RandomPairs:
- h3RandomPairs.append(pair)
- else:
- logger.debug('No H3 stories in '+newsSourceArr[x].name)
-
- # collect articles for each section
- image_index = 0
-
- top_stories = []
- for i in range(len(h1RandomSources)):
- source=newsSourceArr[h1RandomSources[i]]
- randomArticle=random.sample(range(len(source.h1Arr)), 1)[0]
- article=source.h1Arr[randomArticle]
- top_stories.append(article)
-
- middle_stories = []
- for i in range(len(h2RandomPairs)):
- pair=h2RandomPairs[i]
- article=newsSourceArr[pair[0]].h2Arr[pair[1]]
- middle_stories.append(article)
-
- bottom_stories = []
- for i in range(len(h3RandomPairs)):
- pair=h3RandomPairs[i]
- article=newsSourceArr[pair[0]].h3Arr[pair[1]]
- bottom_stories.append(article)
-
- return top_stories, middle_stories, bottom_stories
-
-def buildOutput(top_stories, middle_stories, bottom_stories):
- #read in the template html file
- from jinja2 import Environment, PackageLoader, select_autoescape
- env = Environment(
- loader=PackageLoader('unbiased', 'html_template'),
- autoescape=select_autoescape(['html', 'xml'])
- )
- template = env.get_template('unbiased.jinja.html')
-
- timestamp = time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime())
- utime = int(time.time())
-
- sourcesStr = ', '.join(set([x.source for x in top_stories] + [x.source for x in middle_stories] + [x.source for x in bottom_stories]))
-
- html = template.render(
- timestamp = timestamp,
- utime = utime,
- top_stories = top_stories,
- middle_stories = middle_stories,
- bottom_stories = bottom_stories,
- sources = sourcesStr,
- )
-
- #return updated text
- return html
-
-def writeOutputHTML(outputHTML, outDir):
- timestamp = time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime())
-
- with open(os.path.join(outDir, 'index.html'), 'w') as fp:
- fp.write(outputHTML)
-
- # copy over static package files
- for filename in ['unbiased.css', 'favicon.ico', 'favicon.png', 'apple-touch-icon.png']:
- data = pkgutil.get_data('unbiased', os.path.join('html_template', filename))
- with open(os.path.join(outDir, filename), 'wb') as fp:
- fp.write(data)
-
-def pullImage(url, index, webroot, target_width=350, target_height=200):
- extension = url.split('.')[-1].split('?')[0]
- img_name = 'img{}.{}'.format(index, extension)
- res = requests.get(url)
- if res.status_code == 200:
- content = res.content
- else:
- logger.debug('Image not found: url={}'.format(url))
- return ''
- img = Image.open(io.BytesIO(content))
- # crop to aspect ratio
- target_ar = target_width / target_height
- left, top, right, bottom = img.getbbox()
- height = bottom - top
- width = right - left
- ar = width / height
- if target_ar > ar:
- new_height = (target_height / target_width) * width
- bbox = (left, top + ((height - new_height) / 2), right, bottom - ((height - new_height) / 2))
- img = img.crop(bbox)
- elif target_ar < ar:
- new_width = (target_width / target_height) * height
- bbox = (left + ((width - new_width) / 2), top, right - ((width - new_width) / 2), bottom)
- img = img.crop(bbox)
- # resize if larger
- if target_width * 2 < width or target_height * 2 < height:
- img = img.resize((target_width*2, target_height*2), Image.LANCZOS)
- # TODO: create retina images
- jpg_name = 'img{}.jpg'.format(index)
- out_file = os.path.join(webroot, jpg_name)
- img.save(out_file, 'JPEG')
- return jpg_name
diff --git a/unbiased/unbiasedObjects.py b/unbiased/unbiasedObjects.py
deleted file mode 100644
index 9a8a78a..0000000
--- a/unbiased/unbiasedObjects.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import logging
-
-logger = logging.getLogger('unbiased')
-
-class Article():
- title=''
- url=''
- img=''
- description=''
- source=''
- author=''
-
- def __init__(self, title, url, img, description, source, author):
- self.title=title
- self.url=url
- self.img=img
- self.description=description
- self.source=source
- self.author=author
-
- def __str__(self):
- return '-----------\ntitle: {}\nauthor: {}\nsource: {}\ndescription: {}\nurl: {}\nimg: {}\n-----------'.format(self.title, self.author, self.source, self.description, self.url, self.img)
-
- def __repr__(self):
- return '{}({}, {}, {})'.format(self.source.replace(' ', ''), self.title, self.author, self.url)
-
-
-class NewsSource2():
- name=''
- url=''
- h1Arr=[]
- h2Arr=[]
- h3Arr=[]
- def __init__(self, name, url, h1Arr, h2Arr, h3Arr):
- self.name=name
- self.url=url
- self.h1Arr=h1Arr
- self.h2Arr=h2Arr
- self.h3Arr=h3Arr
-
-
-
-class NewsSource():
- name=''
- url=''
- #multiple start values to step through file. end value default to '"'
- h1SectionDividerStart=None
- h1SectionDividerEnd=None
- h1DelStart=[]
- h1DelEnd='"'
- h2SectionDividerStart=None
- h2SectionDividerEnd=None
- h2DelStart=[]
- h2DelEnd='"'
- h3SectionDividerStart=None
- h3SectionDividerEnd=None
- h3DelStart=[]
- h3DelEnd='"'
- #arrays of Article object types
- h1Arr=None
- h2Arr=None
- h3Arr=None
- #url to attach to stub links
- stubURL=''
-
- def __init__(self, name, url,
- h1DelStart, h2DelStart, h3DelStart,
- h1SectionDividerStart=None, h1SectionDividerEnd=None,
- h2SectionDividerStart=None, h2SectionDividerEnd=None,
- h3SectionDividerStart=None, h3SectionDividerEnd=None,
- stubURL=None):
- self.name=name
- self.url=url
- self.h1DelStart=h1DelStart
- self.h2DelStart=h2DelStart
- self.h3DelStart=h3DelStart
- self.h1SectionDividerStart=h1SectionDividerStart
- self.h2SectionDividerStart=h2SectionDividerStart
- self.h3SectionDividerStart=h3SectionDividerStart
- self.h1SectionDividerEnd=h1SectionDividerEnd
- self.h2SectionDividerEnd=h2SectionDividerEnd
- self.h3SectionDividerEnd=h3SectionDividerEnd
- self.h1Arr=[]
- self.h2Arr=[]
- self.h3Arr=[]
- self.stubURL=stubURL
-
- def addArticle(self, article, level):
- if level==1:
- self.h1Arr.append(article)
- elif level==2:
- self.h2Arr.append(article)
- elif level==3:
- self.h3Arr.append(article)
- else:
- logger.debug("Invalid level in NewsSource.addArtlce: " + level)
-
diff --git a/unbiased/util.py b/unbiased/util.py
new file mode 100644
index 0000000..12003b1
--- /dev/null
+++ b/unbiased/util.py
@@ -0,0 +1,113 @@
+import io
+import logging
+import os
+import pkgutil
+import random
+import shutil
+import time
+
+from PIL import Image
+import requests
+
+logger = logging.getLogger('unbiased')
+
+
+def pick_randoms(story_lists, length, per_source):
+ """
+ Return a randomly chosen list of 'length' stories, picking at
+ most 'per_source' stories from each source.
+ """
+ # TODO: weighting is incorrect if a source has fewer than 'per_source' articles
+ urandom = random.SystemRandom()
+ candidates = []
+ for stories in story_lists:
+ indexes = list(range(len(stories)))
+ urandom.shuffle(indexes)
+ random_indexes = indexes[:per_source]
+ candidates.extend([stories[x] for x in random_indexes])
+ indexes = list(range(len(candidates)))
+ urandom.shuffle(indexes)
+ random_indexes = indexes[:length]
+ return tuple(candidates[x] for x in random_indexes)
+
+
+def pickStories(newsSourceArr):
+ h1s = pick_randoms([x.h1s for x in newsSourceArr], 4, 1)
+ h2s = pick_randoms([x.h2s for x in newsSourceArr], 6, 2)
+ h3s = pick_randoms([x.h3s for x in newsSourceArr], 12, 2)
+ return h1s, h2s, h3s
+
+
+def buildOutput(top_stories, middle_stories, bottom_stories):
+ # read in the template html file
+ from jinja2 import Environment, PackageLoader, select_autoescape
+ env = Environment(
+ loader=PackageLoader('unbiased', 'html_template'),
+ autoescape=select_autoescape(['html', 'xml'])
+ )
+ template = env.get_template('unbiased.jinja.html')
+
+ timestamp = time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime())
+ utime = int(time.time())
+
+ sourcesStr = ', '.join(set([x.source for x in top_stories] + [x.source for x in middle_stories] + [x.source for x in bottom_stories]))
+
+ html = template.render(
+ timestamp=timestamp,
+ utime=utime,
+ top_stories=top_stories,
+ middle_stories=middle_stories,
+ bottom_stories=bottom_stories,
+ sources=sourcesStr,
+ )
+
+ return html
+
+
+def write_files(files_to_write, outDir):
+ for name, bytesio in files_to_write.items():
+ with open(os.path.join(outDir, name), 'wb') as fp:
+ shutil.copyfileobj(bytesio, fp)
+
+
+def write_static_files(outDir):
+ # copy over static package files
+ for filename in ['unbiased.css', 'favicon.ico', 'favicon.png', 'apple-touch-icon.png']:
+ data = pkgutil.get_data('unbiased', os.path.join('html_template', filename))
+ with open(os.path.join(outDir, filename), 'wb') as fp:
+ fp.write(data)
+
+
+def pullImage(url, index, webroot, target_width=350, target_height=200):
+ res = requests.get(url)
+ if res.status_code == 200:
+ content = res.content
+ else:
+ logger.debug('Image not found: url={}'.format(url))
+ return ''
+ img = Image.open(io.BytesIO(content))
+ # crop to aspect ratio
+ target_ar = target_width / target_height
+ left, top, right, bottom = img.getbbox()
+ height = bottom - top
+ width = right - left
+ ar = width / height
+ if target_ar > ar:
+ new_height = (target_height / target_width) * width
+ bbox = (left, top + ((height - new_height) / 2), right, bottom - ((height - new_height) / 2))
+ img = img.crop(bbox)
+ elif target_ar < ar:
+ new_width = (target_width / target_height) * height
+ bbox = (left + ((width - new_width) / 2), top, right - ((width - new_width) / 2), bottom)
+ img = img.crop(bbox)
+ # resize if larger
+ if target_width * 2 < width or target_height * 2 < height:
+ img = img.resize((target_width * 2, target_height * 2), Image.LANCZOS)
+ # TODO: fill with a neutral color instead of just discarding alpha channel
+ img = img.convert('RGB')
+ # TODO: create retina images
+ jpg_name = 'img{}.jpg'.format(index)
+ jpg_file = io.BytesIO()
+ img.save(jpg_file, 'JPEG')
+ jpg_file.seek(0)
+ return jpg_name, jpg_file
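
As a sanity check on the cropping math in pullImage (numbers are illustrative): for a 1200x900 source image and the default 350x200 target, target_ar = 1.75 and ar = 1.33, so the target is wider than the source and the new_height branch applies. new_height = (200 / 350) * 1200, roughly 686, which trims about 107 pixels from both the top and the bottom. The original width (1200) is still more than twice the target width (350 * 2 = 700), so the crop is then downscaled to 700x400, a 2x version of the 350x200 slot, before being converted to RGB and written out as a JPEG.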