author    | Matt Singleton <matt@xcolour.net> | 2017-10-14 19:48:14 -0400
committer | GitHub <noreply@github.com>       | 2017-10-14 19:48:14 -0400
commit    | 4317400d97abb928f584099516d8501c20f9d9b7 (patch)
tree      | 52b28c0aeacb3547a1fbe2da50906d195023c35d
parent    | 38a64b344bc6a25ce0faf17ddb7ed3439d0d007d (diff)
parent    | 4b5f6067f05b5e35555f8e55219808470f9d664f (diff)
Merge pull request #17 from sstvinc2/refactor-sources
Refactor sources
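
For orientation before the diff: this change replaces the monolithic per-source build functions in unbiased/parser.py with one class per outlet under unbiased/sources/. Each outlet subclasses NewsSource (unbiased/sources/base.py) and only implements homepage scraping; get_sources() in unbiased/sources/__init__.py walks the package and registers every subclass under its lowercased shortname. A minimal sketch of what adding a new outlet looks like under the new layout (the outlet, URL, and selectors below are made up for illustration; see unbiased/sources/abc.py in this diff for a real implementation):

    # examplewire.py -- hypothetical source module, illustrative only
    from unbiased.sources.base import NewsSource

    class ExampleWire(NewsSource):

        name = 'Example Wire'      # display name used in logs and story credits
        shortname = 'example'      # key used by get_sources() and the new --sources flag
        url = 'http://news.example.com/'

        bad_titles = ['Opinion']   # optional blacklists filtered by the base class

        @classmethod
        def _fetch_urls(cls):
            # Return three tuples of article URLs, one per headline tier.
            # The base class normalizes, de-duplicates, downloads, and filters them.
            soup = cls._fetch_content(cls.url)
            h1s = (soup.find('h1').a['href'],)
            h2s = tuple(a['href'] for a in soup.select('.top-stories a')[:4])
            h3s = tuple(a['href'] for a in soup.select('.latest a')[:5])
            return h1s, h2s, h3s

Article download, og: metadata parsing, and bad-story filtering all live in NewsSource.build() in base.py, which is why the per-source files added in this diff are only 26-42 lines each.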
-rw-r--r-- | .gitignore                    |   1
-rw-r--r-- | requirements.txt              |   2
-rwxr-xr-x | unbiased/main.py              |  82
-rwxr-xr-x | unbiased/parser.py            | 986
-rw-r--r-- | unbiased/sources/__init__.py  |  10
-rw-r--r-- | unbiased/sources/abc.py       |  42
-rw-r--r-- | unbiased/sources/base.py      | 210
-rw-r--r-- | unbiased/sources/bbc.py       |  26
-rw-r--r-- | unbiased/sources/cbs.py       |  37
-rw-r--r-- | unbiased/sources/csm.py       |  41
-rw-r--r-- | unbiased/sources/fox.py       |  41
-rw-r--r-- | unbiased/sources/guardian.py  |  37
-rw-r--r-- | unbiased/sources/npr.py       |  29
-rw-r--r-- | unbiased/sources/thehill.py   |  39
-rw-r--r-- | unbiased/sources/washtimes.py |  31
-rwxr-xr-x | unbiased/spotCheck.py         |  41
-rw-r--r-- | unbiased/unbiasedFunctions.py | 277
-rw-r--r-- | unbiased/unbiasedObjects.py   |  97
-rw-r--r-- | unbiased/util.py              | 113
19 files changed, 694 insertions, 1448 deletions
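
The main.py hunks below wire these classes into the crawl loop: run() now takes the list of source shortnames from the new -s/--sources flag (comma-separated, default all) plus a debug_mode flag that re-raises build errors instead of retrying. Roughly, the selection logic in the patch behaves like this paraphrased sketch (not a verbatim copy of the new code):

    from unbiased.sources import get_sources

    def select_sources(source_names=None):
        # get_sources() maps lowercased shortname -> NewsSource subclass
        registry = get_sources()
        if source_names is None:
            return list(registry.values())                 # crawl every registered outlet
        return [registry[name] for name in source_names]   # e.g. ['bbc', 'npr', 'fox']

    built = [cls.build() for cls in select_sources(['bbc', 'npr'])]

The other visible change in run() is that downloaded images and the rendered index.html are collected into a files_to_write dict and handed to util.write_files(), instead of each helper writing into the webroot directly.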
@@ -13,3 +13,4 @@ venv/
 unbiased.egg-info/
 #*
 .#*
+*.swp
diff --git a/requirements.txt b/requirements.txt
index 3767095..0d53cea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 beautifulsoup4~=4.6.0
 Jinja2~=2.9.6
-lxml=~=3.8.0
+lxml~=3.8.0
 Pillow~=4.2.1
 requests~=2.18.4
diff --git a/unbiased/main.py b/unbiased/main.py
index 4ccda24..19fd05b 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -1,13 +1,13 @@
 #!/usr/bin/env python3

 import argparse
+import io
 import logging
 import logging.config
 import time

-from unbiased.unbiasedObjects import *
-from unbiased.unbiasedFunctions import *
-from unbiased.parser import *
+from unbiased.util import pickStories, pullImage, buildOutput, write_files, write_static_files
+from unbiased.sources import get_sources

 logger = logging.getLogger('unbiased')
@@ -52,6 +52,7 @@ def main():
     parser.add_argument('-l', '--log-dir', help='location to write detailed logs')
     parser.add_argument('-d', '--debug', action='store_true', help='run in debug mode')
     parser.add_argument('-o', '--oneshot', action='store_true', help='run once and exit')
+    parser.add_argument('-s', '--sources', type=lambda x: x.split(','), default=None)
     args = parser.parse_args()

     if args.log_dir:
@@ -67,7 +68,7 @@ def main():
     while True:
         logger.info('Starting crawl')
         start = time.time()
-        run(args.webroot)
+        run(args.webroot, args.sources, args.debug)
         finish = time.time()
         runtime = finish - start
         sleeptime = crawl_frequency - runtime
@@ -77,51 +78,34 @@
         if sleeptime > 0:
             time.sleep(sleeptime)

-def run(webroot):
-    sources = []
-
-    '''
-    SOURCES TO ADD NEXT:
-    -REUTERS
-    -Town Hall
-    '''
-
-    logger.debug('Running with webroot="{}"'.format(webroot))
-
-    ### These values have to be the second half of the function name
-    ### E.g. Guardian calls buildGuardian(), etc.
-    sourceFnArr = [
-        'Guardian',
-        'TheHill',
-        'NPR',
-        'BBC',
-        'NBC',
-        'CBS',
-        'FoxNews',
-        'WashTimes',
-        'CSM',
-        'ABC',
-    ]
-
-    for source in sourceFnArr:
-        logger.info('Crawling {}'.format(source))
+def run(webroot, source_names, debug_mode=False):
+
+    logger.debug('Running with webroot="{}" for sources="{}"'.format(webroot, source_names))
+
+    sources = get_sources()
+    if source_names is None:
+        sources = sources.values()
+    else:
+        sources = [sources[x] for x in source_names]
+
+    built_sources = []
+    for source in sources:
+        logger.info('Crawling {}'.format(source.name))
         tries = 0
         while tries < 3:
             time.sleep(tries)
             try:
-                fn = 'build' + source
-                possibles = globals().copy()
-                possibles.update(locals())
-                method = possibles.get(fn)
-                src = method()
-                sources.append(src)
+                built_sources.append(source.build())
                 break
             except Exception as ex:
+                if debug_mode is True:
+                    raise
                 tries += 1
                 if tries == 3:
-                    logger.error('Build failed. source={} ex={}'.format(source, ex))
+                    logger.error('Build failed. source={} ex={}'.format(source.name, ex))
                 else:
-                    logger.debug('Build failed, retrying. source={} ex={}'.format(source, ex))
+                    logger.debug('Build failed, retrying. source={} ex={}'.format(source.name, ex))
+    sources = tuple(built_sources)

     logger.info('Parsed home pages for: {}'.format([x.name for x in sources]))
     top_stories, middle_stories, bottom_stories = pickStories(sources)
@@ -129,20 +113,26 @@ def run(webroot):
     logger.info('Picked middle stories from: {}'.format([x.source for x in middle_stories]))
     logger.info('Picked bottom stories from: {}'.format([x.source for x in bottom_stories]))

+    files_to_write = {}
+
     # download images
     img_idx = 0
     for story in top_stories:
-        story.img = pullImage(story.img, img_idx, webroot, 350, 200)
+        story.img, img_jpg = pullImage(story.img, img_idx, webroot, 350, 200)
+        files_to_write[story.img] = img_jpg
         img_idx += 1
     for story in middle_stories:
-        story.img = pullImage(story.img, img_idx, webroot, 150, 100)
+        story.img, img_jpg = pullImage(story.img, img_idx, webroot, 150, 100)
+        files_to_write[story.img] = img_jpg
         img_idx += 1

-    #build the output file HTML
-    outputHTML = buildOutput(top_stories, middle_stories, bottom_stories)
+    # build the output file HTML
+    output_html = buildOutput(top_stories, middle_stories, bottom_stories)
+    output_html = io.BytesIO(output_html.encode('utf8'))
+    files_to_write['index.html'] = output_html

-    #print the output file HTML
-    writeOutputHTML(outputHTML, webroot)
+    write_files(files_to_write, webroot)
+    write_static_files(webroot)

 if __name__=="__main__":
     main()
diff --git a/unbiased/parser.py b/unbiased/parser.py
deleted file mode 100755
index 399e0f2..0000000
--- a/unbiased/parser.py
+++ /dev/null
@@ -1,986 +0,0 @@
-#!/usr/bin/env python3
-
-import logging
-import os
-import re
-import urllib.parse
-
-from bs4 import BeautifulSoup
-import requests
-
-from unbiased.unbiasedObjects import *
-from unbiased.unbiasedFunctions import buildArticle
-
-logger = logging.getLogger('unbiased')
-
-
-'''
-Takes in a URL, downloads the file to a temp file,
-reads the file into a string, and returns that string
-'''
-def urlToContent(url, sourceEncoding='utf8'):
- res = requests.get(url)
- if res.status_code == 200:
- return res.text
- else:
- raise Exception("Failed to download {}".format(url))
-
-
-'''
-Creates a new newsSource2 object. For each URL in h1-h3URLs,
-calls the file scraper and appends the new Article object.
-Returns a newsSource2 object
-'''
-def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
-
- url_parts = urllib.parse.urlparse(url)
- scheme = url_parts.scheme
- h1URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h1URLs]
- h2URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h2URLs]
- h3URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h3URLs]
-
- h1Arr=[]
- a=buildArticle(h1URLs[0], name)
- if a==None:
- logger.debug('H1 Nonetype in '+name)
- else:
- h1Arr.append(a)
-
- h2Arr=[]
- for x in h2URLs:
- a=buildArticle(x, name)
- if a!=None:
- h2Arr.append(a)
- else:
- logger.debug('H2 Nonetype in '+name)
-
- h3Arr=[]
- for x in h3URLs:
- a=buildArticle(x, name)
- if a!=None:
- h3Arr.append(a)
- else:
- logger.debug('H3 Nonetype in '+name)
-
- #BUILD THE NEWS SOURCE
- newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr)
-
- return newsSource
-
-
-'''
-Some sites will replicate URLs across the page. This function removes them.
-Check hierarchically: if h3 exists in h1s or h2s, remove from h3s;
-if h2 exists in h1s, remove from h2s
-
-also check partial URLs (e.g. nytimes.com/story.html is the same as
-nytimes.com/story.html?var=x
-'''
-def removeDuplicates(h1s, h2s, h3s):
- #Assume h1s is one element, and keep it
-
- #remove h2 duplicates
- removeArr=[]
- for i in range(len(h2s)):
- #check internally
- for j in range(len(h2s)):
- if i==j:
- continue
- else:
- if h2s[i] in h2s[j]:
- removeArr.append(h2s[j])
- #check against h1s
- for k in range(len(h1s)):
- if (h2s[i] in h1s[k]) or (h1s[k] in h2s[i]):
- removeArr.append(h2s[i])
- for x in removeArr:
- h2s.remove(x)
-
- #remove h3 duplicates
- removeArr=[]
- for i in range(len(h3s)):
- #check internally
- for j in range(len(h3s)):
- if i==j:
- continue
- else:
- if h3s[i] in h3s[j]:
- removeArr.append(h3s[j])
- #check against h1s and h2s
- h1and2=h1s+h2s
- for k in range(len(h1and2)):
- if (h3s[i] in h1and2[k]) or (h1and2[k] in h3s[i]):
- removeArr.append(h3s[i])
- for x in removeArr:
- if x in h3s:
- h3s.remove(x)
-
-
- return h1s, h2s, h3s
-
-
-
-def removalNotification(source, title, reason, value):
- logger.debug("""Story removed
- SOURCE:\t{}
- TITLE:\t{})
- REASON:\t{}
- VALUE:\t{}""".format(source, title, reason, value))
-
-
-def removeBadStoriesHelper(source, element, badStringList, article_tiers):
- if badStringList is None:
- return
- for tier, articles in enumerate(article_tiers):
- for idx, article in enumerate(articles):
- if article is None:
- logger.debug("None type found in removeBadStoriesHelper for {}".format(source.name))
- break
- for item in badStringList:
- if item in getattr(article, element):
- article_tiers[tier].remove(article)
- # if it's in the h1 slot, bump up the
- # first h2 into the h1 slot
- if tier == 0 and len(article_tiers[1]) > 0:
- article_tiers[0].append(article_tiers[1][0])
- article_tiers[1].remove(article_tiers[1][0])
- removalNotification(source.name, article.title, element, item)
-
-
-def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None):
-
- arr=[source.h1Arr, source.h2Arr, source.h3Arr]
-
- removeBadStoriesHelper(source, "title", badTitleArr, arr)
- removeBadStoriesHelper(source, "description", badDescArr, arr)
- removeBadStoriesHelper(source, "author", badAuthorArr, arr)
- removeBadStoriesHelper(source, "img", badImgArr, arr)
- removeBadStoriesHelper(source, "url", badURLArr, arr)
-
- return source
-
-
-
-
-def buildTheHill():
- url='http://thehill.com'
- name='The Hill'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- h1=h1.split('<div class="headline-story-image">', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
- h1s=[url+h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('<div class="section-top-content">', 1)[1]
- h2=h2.split('</ul>', 1)[0]
- while '<div class="top-story-item' in h2 and len(h2s)<4:
- h2=h2.split('<div class="top-story-item', 1)[1]
- x=h2.split('<a href="', 1)[1]
- x=x.split('"', 1)[0]
- h2s.append(url+x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('<div class="section-top-content">', 1)[1]
- h3=h3.split('</ul>', 1)[0]
- while '<div class="top-story-item small' in h3:
- h3=h3.split('<div class="top-story-item small', 1)[1]
- x=h3.split('<a href="', 1)[1]
- x=x.split('"', 1)[0]
- h3s.append(url+x)
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- hil=buildNewsSource2(name, url, h1s, h2s, h3s)
- hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp', 'Juan Williams', 'Judd Gregg'], None, None)
-
- return hil
-
-
-
-
-
-def buildGuardian():
- url='http://www.theguardian.com/us'
- name='The Guardian US'
-
-
- while True:
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, 'utf8')
-
- #get main headline
- h1=content
- h1=h1.split('<h1', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
-
- if h1!='https://www.theguardian.com/us':
- break
- else:
- logger.debug('Guardian loop')
-
- h1s=[h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- #only the h1 and the two h2s have this, so split on it and grab
- #the second two
- h2=h2.split('<div class="fc-item__image-container u-responsive-ratio inlined-image">')[2:]
- for x in h2:
- if '<h2 class="fc-item__title"><a href="' in x:
- x=x.split('<h2 class="fc-item__title"><a href="', 1)[1]
- x=x.split('"', 1)[0]
- h2s.append(x)
- else:
- break
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('<div class="fc-slice-wrapper">', 1)[1]
- h3=h3.split('<div class="fc-container__inner">', 1)[0]#'<div class="js-show-more-placeholder">', 1)[0]
- #this story section goes on forever; just grab the first 5
- while '<h2 class="fc-item__title"><a href="' in h3:
- h3=h3.split('<h2 class="fc-item__title"><a href="', 1)[1]
- x=h3.split('"', 1)[0]
- h3s.append(x)
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
- gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
- gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
-
- return gdn
-
-
-
-def buildWashTimes():
- url='http://www.washingtontimes.com/'
- name='Washington Times'
-
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- h1=h1.split('top-news', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
-
- h1s=[url+h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('class="top-news', 1)[1]
- h2=h2.split('</article>', 1)[1] #end of top-news article
- h2=h2.split('<article ', 1)[0] #note the space; we want unclassed articles
- h2=h2.split('<article>')[1:]
-
- for x in h2:
- x=x.split('<a href="', 1)[1]
- x=x.split('"', 1)[0]
- h2s.append(url+x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('more-from desktop-only', 1)[1]
- h3=h3.split('</section>', 1)[0]
- h3=h3.split('<a href="')[1:]
-
- for x in h3:
- x=x.split('"', 1)[0]
- h3s.append(url+x)
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
- wat=buildNewsSource2(name, url, h1s, h2s, h3s)
- wat=removeBadStories(wat, None, None, None, None)
-
- return wat
-
-
-def buildCSM():
- url='http://www.csmonitor.com/USA'
- name='Christian Science Monitor'
-
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #this makes sure we don't get '/USA' in the URL twice
- url=url.split('/USA')[0]
-
- #get main headline
- h1=content
- h1=h1.split('block-0-0', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
-
- h1s=[url+h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('block-1-0', 1)[1]
- h2=h2.split('ui-section-middle', 1)[0]
- h2=h2.split('<h3 class="story_headline">')[1:]
-
- for x in h2:
- temp=x.split('<a href="', 2)[1:]
- x=temp[0]
- x=x.split('"', 1)[0]
- if x=='/csmlists/special/first-look':
- x=temp[1]
- x=x.split('"', 1)[0]
-
- h2s.append(url+x)
- #also add in the floating story on the left
- h2=content
- h2=h2.split('block-0-1', 1)[1]
- h2=h2.split('<h3 class="story_headline">')[1]
- h2=h2.split('<a href="', 2)[2]
- h2=h2.split('"', 1)[0]
- h2s.append(url+h2)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('block-0-2', 1)[1]
- h3=h3.split('ui-section-top-right', 1)[0]
- h3=h3.split('<h3 class="story_headline')[1:]
-
- for x in h3:
- x=x.split('<a href="', 2)[-1]
- x=x.split('"', 1)[0]
- h3s.append(url+x)
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
- csm=buildNewsSource2(name, url, h1s, h2s, h3s)
-
- badTitleArr=['Change Agent']
- badDescArr=None
- badAuthorArr=None
- badImgArr=['csm_logo']
- badURLArr=['difference-maker']
- csm=removeBadStories(csm, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
-
- return csm
-
-
-
-'''
-Function to fix the oddly short og:descriptions provided
-in The Blaze articles by grabbing the first portion of the story instead
-'''
-def blazeFixDesc(articleArr):
- TAG_RE = re.compile(r'<[^>]+>')
- for i in range(len(articleArr)):
- desc=urlToContent(articleArr[i].url)
- desc=desc.split('<div class="entry-content article-styles">', 1)[1]
- desc=desc.split('<p>', 1)[1]
- desc=TAG_RE.sub('', desc)
- desc=desc.replace('\n', ' ')
- desc=desc[:144]
- articleArr[i].description=desc
-
- return articleArr
-
-
-
-def buildBlaze():
- url='http://theblaze.com'
- name='The Blaze'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- h1=h1.split('<!-- home -->', 1)[1]
- h1=h1.split('<a class="gallery-link" href="', 1)[1]
- h1=h1.split('"', 1)[0]
- h1s=[url+h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('<!-- home -->', 1)[1]
- h2=h2.split('<!-- loop-home -->', 1)[0]
- while '<a class="gallery-link" href="' in h2:#'</figure>\n\n<figure class="gallery-item">' in h2:
- h2=h2.split('<a class="gallery-link" href="', 1)[1]#'</figure>\n\n<figure class="gallery-item">', 1)[1]
- #h2=h2.split('href="', 1)[1]
- x=h2.split('"', 1)[0]
- if h1 not in x:
- h2s.append(url+x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('<!-- loop-home -->', 1)[1]
- #this story section goes on forever; just grab the first 5
- while len(h3s)<5:
- h3=h3.split('<a class="feed-link" href="', 1)[1]
- x=h3.split('"', 1)[0]
- if h1 not in x:
- h3s.append(url+x)
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
- blz=buildNewsSource2(name, url, h1s, h2s, h3s)
-
- badTitleArr=['Tucker Carlson', 'Mark Levin']
- badDescArr=['Lawrence Jones', 'Mike Slater']
- badAuthorArr=['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales', 'Doc Thompson', 'Glenn Beck']
- badImgArr=None
- badURLArr=None
- blz=removeBadStories(blz, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
-
-
- #The Blaze has dumb, short description fields, so we need to grab
- #the first x characters of actual article text instead
- blz.h1Arr=blazeFixDesc(blz.h1Arr)
- blz.h2Arr=blazeFixDesc(blz.h2Arr)
- blz.h3Arr=blazeFixDesc(blz.h3Arr)
-
- return blz
-
-
-
-def buildCBS():
- url='http://cbsnews.com'
- name='CBS News'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- if '<h1 class="title">' in content:
- h1=h1.split('<h1 class="title">', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
- h1s=[url+h1]
- else:
- #for cases where they lead with a video, pull the first h2 as h1
- h1=h1.split('Big News Area Side Assets', 1)[1]
- h1=h1.split('</ul></div>', 1)[0]
- h1=h1.split('<li data-tb-region-item>', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- x=h1.split('"', 1)[0]
- h1s=[url+x]
-
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('Big News Area Side Assets', 1)[1]
- h2=h2.split('</ul></div>', 1)[0]
- while '<li data-tb-region-item>' in h2:
- h2=h2.split('<li data-tb-region-item>', 1)[1]
- h2=h2.split('<a href="', 1)[1]
- x=h2.split('"', 1)[0]
- if h1 not in x:
- h2s.append(url+x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('Latest News', 1)[1]
- #this story section goes on forever; just grab the first 5
- while len(h3s)<5:
- h3=h3.split('<li class="item-full-lead"', 1)[1]
- h3=h3.split('<a href="', 1)[1]
- x=h3.split('"', 1)[0]
- if h1 not in x:
- h3s.append(url+x)
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- cbs=buildNewsSource2(name, url, h1s, h2s, h3s)
- cbs=removeBadStories(cbs, ['60 Minutes'], ['60 Minutes'], None, None, ['whats-in-the-news-coverart'])
-
- return cbs
-
-
-
-
-
-def buildNBC():
- url='http://nbcnews.com'
- name='NBC News'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- h1=h1.split('top-stories-section', 1)[1]
- h1=h1.split('panel_hero', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
- if '.com' not in h1:
- h1=url+h1
- h1s=[h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('ad-content ad-xs mobilebox1', 1)[1]
- h2=h2.split('taboola-native-top-stories-thumbnail', 1)[0]
- while '<div class="story-link' in h2:
- h2=h2.split('<div class="story-link', 1)[1]
- h2=h2.split('<a href="', 1)[1]
- x=h2.split('"', 1)[0]
- if h1 not in x:
- if '.com' not in x:
- x=url+x
- h2s.append(x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('js-more-topstories', 1)[1]
- h3=h3.split('<div class="panel-section', 1)[0]
- while '<div class="story-link' in h3:
- h3=h3.split('<div class="story-link', 1)[1]
- h3=h3.split('<a href="', 1)[1]
- x=h3.split('"', 1)[0]
- if h1 not in x:
- if '.com' not in x:
- x=url+x
- h3s.append(x)
-
- #adjust for today.com urls
- '''
- for arr in [h1s, h2s, h3s]:
- for i in range(len(arr)):
- if 'today.com' in arr[i]:
- arr[i]=arr[i].split('.com', 1)[1]
- '''
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- nbc=buildNewsSource2(name, url, h1s, h2s, h3s)
- nbc=removeBadStories(nbc, None, ['First Read'], None, None, None)
-
-
- return nbc
-
-
-
-
-def buildBBC():
- url='http://www.bbc.com/news/world/us_and_canada'
- name='BBC US & Canada'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- h1=h1.split('buzzard-item', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
- h1s=['http://www.bbc.com'+h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('<div class="pigeon">', 1)[1]
- h2=h2.split('<div id=', 1)[0]
- while 'top_stories#' in h2:
- h2=h2.split('top_stories#', 1)[1]
- h2=h2.split('<a href="', 1)[1]
- x=h2.split('"', 1)[0]
- if h1 not in x:
- h2s.append('http://www.bbc.com'+x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('<div class="macaw">', 1)[1]
- h3=h3.split('Watch/Listen', 1)[0]
- while '<div class="macaw-item' in h3:
- h3=h3.split('<div class="macaw-item', 1)[1]
- h3=h3.split('<a href="', 1)[1]
- x=h3.split('"', 1)[0]
- if h1 not in x:
- h3s.append('http://www.bbc.com'+x)
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- bbc=buildNewsSource2(name, url, h1s, h2s, h3s)
- badTitleArr=None
- badDescArr=None
- badAuthorArr=None
- badImgArr=['bbc_news_logo.png']
- bbc=removeBadStories(bbc, badTitleArr, badDescArr, badAuthorArr, badImgArr)
-
-
- #REMOVE ' - BBC News' from headlines
- for i in range(len(bbc.h1Arr)):
- if ' - BBC News' in bbc.h1Arr[i].title:
- bbc.h1Arr[i].title=bbc.h1Arr[i].title.split(' - BBC News', 1)[0]
- for i in range(len(bbc.h2Arr)):
- if ' - BBC News' in bbc.h2Arr[i].title:
- bbc.h2Arr[i].title=bbc.h2Arr[i].title.split(' - BBC News', 1)[0]
- for i in range(len(bbc.h3Arr)):
- if ' - BBC News' in bbc.h3Arr[i].title:
- bbc.h3Arr[i].title=bbc.h3Arr[i].title.split(' - BBC News', 1)[0]
-
- return bbc
-
-
-
-def buildWeeklyStandard():
- url='http://www.weeklystandard.com'
- name='Weekly Standard'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- h1=h1.split('<div id="region_1"', 1)[1]
- h1=h1.split('<div id="region_2"', 1)[0]
- h1=h1.split('<div class="lead-photo">', 1)[1]
- h1=h1.split('href="', 1)[1]
- h1=h1.split('"', 1)[0]
- h1s=[h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('<div class="widget lead-story layout-3col-feature" data-count="2">', 1)[1]
- h2=h2.split('<div id="region_2"', 1)[0]
- while '<div class="lead-photo">' in h2:
- h2=h2.split('<div class="lead-photo">', 1)[1]
- h2=h2.split('href="', 1)[1]
- x=h2.split('"', 1)[0]
- if h1 not in x:
- h2s.append(x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('Today\'s Standard', 1)[1]
- h3=h3.split('<div id="region_3"', 1)[0]
- while '<div class="lead-photo">' in h3:
- h3=h3.split('<div class="lead-photo">', 1)[1]
- h3=h3.split('href="', 1)[1]
- x=h3.split('"', 1)[0]
- if h1 not in x:
- h3s.append(x)
-
- #Need to add URL prefix to all URLs
- for i in range(len(h1s)):
- h1s[i]=url+h1s[i]
- for i in range(len(h2s)):
- h2s[i]=url+h2s[i]
- for i in range(len(h3s)):
- h3s[i]=url+h3s[i]
-
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- wkl=buildNewsSource2(name, url, h1s, h2s, h3s)
-
- #REMOVE BAD STORIES
- badTitleArr=None
- ## if flagged again, remove Micah Mattix
- badDescArr=['Matt Labash']
- badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner', 'MARK HEMINGWAY']
- badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png']
- wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr)
-
- return wkl
-
-
-
-
-def buildNPR():
- url='http://www.npr.org/sections/news/'
- name='NPR'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- h1=h1.split('<a id="mainContent">', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
- h1s=[h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('<article class="item has-image">', 1)[1]
- h2=h2.split('<!-- END CLASS=\'FEATURED-3-UP\' -->', 1)[0]
- while '<article class="item has-image">' in h2:
- h2=h2.split('<article class="item has-image">', 1)[1]
- h2=h2.split('<a href="', 1)[1]
- x=h2.split('"', 1)[0]
- if h1 not in x:
- h2s.append(x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('<div id="overflow" class="list-overflow"', 1)[1]
- h3=h3.split('<!-- END ID="OVERFLOW" CLASS="LIST-OVERFLOW"', 1)[0]
- while '<h2 class="title"><a href="' in h3:
- h3=h3.split('<h2 class="title"><a href="', 1)[1]
- x=h3.split('"', 1)[0]
- if h1 not in x:
- h3s.append(x)
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
- npr=buildNewsSource2(name, url, h1s, h2s, h3s)
-
- #REMOVE BAD STORIES
- badTitleArr=['The Two-Way']
- badDescArr=None
- badAuthorArr=['Domenico Montanaro']
- badImgArr=None
- npr=removeBadStories(npr, badTitleArr, badDescArr, badAuthorArr, badImgArr)
-
- return npr
-
-
-
-
-
-def buildABC():
- url='http://www.abcnews.go.com'
- name='ABC News'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- h1=content
- h1=h1.split('id="row-1"', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
- h1s=[h1]
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- h2=h2.split('id="row-2"', 1)[1]
- h2=h2.split('id="row-3"', 1)[0]
- h2=h2.split('card single row-item')[1:3] #should just be 2 of these
- for x in h2:
- x=x.split('<a href="', 1)[1]
- x=x.split('"', 1)[0]
- if h1 not in x:
- h2s.append(x)
-
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- h3=h3.split('id="row-1"', 1)[1]
- h3=h3.split('tab-data active', 1)[1]
- h3=h3.split('tab-data"', 1)[0] #note the trailing quotation
- while '<a href="' in h3:
- h3=h3.split('<a href="', 1)[1]
- x=h3.split('"', 1)[0]
- if h1 not in x:
- h3s.append(x)
-
- h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
- abc=buildNewsSource2(name, url, h1s, h2s, h3s)
-
- #REMOVE BAD STORIES
- badTitleArr=None
- badDescArr=None
- badAuthorArr=None
- badImgArr=None
- badURLArr=None
- abc=removeBadStories(abc, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
-
- return abc
-
-
-
-
-def buildFoxNews():
- url = 'http://foxnews.com'
- name = 'Fox News'
-
- # DOWNLOAD HOMEPAGE CONTENT
- content = urlToContent(url)
- soup = BeautifulSoup(content, 'lxml')
-
- # get main headline
- h1 = soup.find('div', id='big-top')\
- .find('div', class_='primary')\
- .find('h1')\
- .find('a')
- h1 = h1['href']
- h1s = [h1]
- h1s = ['http:' + x if x.startswith('//') else x for x in h1s]
-
- #GET SECONDARY HEADLINES
- h2s = soup.find('div', id='big-top').find('div', class_='top-stories').select('li > a')
- h2s = [x['href'] for x in h2s]
- h2s = ['http:' + x if x.startswith('//') else x for x in h2s]
-
- #GET TERTIARY HEADLINES
- h3s = []
- for ul in soup.find('section', id='latest').find_all('ul', recursive=False):
- for li in ul.find_all('li', recursive=False):
- h3s.append(li.find('a')['href'])
- h3s = ['http:' + x if x.startswith('//') else x for x in h3s]
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- fox=buildNewsSource2(name, url, h1s, h2s, h3s)
-
- #REMOVE BAD STORIES
- badTitleArr=['O\'Reilly', 'Fox News', 'Brett Baier', 'Tucker']
- badDescArr=['Sean Hannity']
- badAuthorArr=['Bill O\'Reilly', 'Sean Hannity', 'Howard Kurtz']
- badImgArr=['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
- badURLArr=['http://www.foxnews.com/opinion', 'videos.foxnews.com']
- fox=removeBadStories(fox, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
-
- return fox
-
-
-
-def buildNYT():
- url='http://www.nytimes.com'
- name='New York Times'
-
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
- #get main headline
- #this will likely need if/else logic
- h1=content
-
- if 'story theme-summary banner' in h1:
- #This is with a large headline over a and b columns
- h1=h1.split('story theme-summary banner', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
- else:
- #otherwise, pull the first story from the A column
- h1=h1.split('<div class="a-column column">', 1)[1]
- h1=h1.split('<article class="story theme-summary lede"', 1)[1]
- h1=h1.split('<a href="', 1)[1].split('"', 1)[0]
- h1s=[h1]
-
-
- #GET SECONDARY HEADLINES
- h2=content
- h2s=[]
- #A column
- h2=h2.split('<div class="a-column column">', 1)[1]
- h2=h2.split('<!-- close a-column -->', 1)[0]
- #remove "collection" sets
- while '<div class="collection headlines">' in h2:
- arr=h2.split('<div class="collection headlines">', 1)
- h2=arr[0]+arr[1].split('</ul>', 1)[1]
- #Grab the remaining URLs
- while '<a href="' in h2:
- h2=h2.split('<a href="', 1)[1]
- x=h2.split('"', 1)[0]
- if h1 not in x:
- h2s.append(x)
-
- #GET TERTIARY HEADLINES
- h3s=[]
- #B column
- h3=content
- h3=h3.split('<div class="b-column column">', 1)[1]
- h3=h3.split('<!-- close b-column -->', 1)[0]
- #remove "collection" sets
- while '<div class="collection headlines">' in h3:
- arr=h3.split('<div class="collection headlines">', 1)
- h3=arr[0]+arr[1].split('</ul>', 1)[1]
- #Grab the remaining URLs
- while '<a href="' in h3:
- h3=h3.split('<a href="', 1)[1]
- x=h3.split('"', 1)[0]
- if (h1 not in x) and (x not in h3s):
- h3s.append(x)
-
- '''
- #GET TERTIARY HEADLINES
- h3=content
- h3s=[]
- if '<!-- close lede-package-region -->' in h3:
- h3=h3.split('<!-- close lede-package-region -->', 1)[1]
- h3=h3.split('<a href="https://www.nytimes.com/tips">', 1)[0]
- elif '/video/the-daily-360' in h3:
- h3=h3.split('/video/the-daily-360')[-1]
- h3=h3.split('More News', 1)[0]
- #remove "collection" sets
- while '<div class="collection headlines">' in h2:
- arr=h3.split('<div class="collection headlines">', 1)
- h3=arr[0]+arr[1].split('</ul>', 1)[1]
-
- #Grab the remaining URLs
- while '<a href="' in h3:
- h3=h3.split('<a href="', 1)[1]
- x=h3.split('"', 1)[0]
- if (h1 not in x) and (x not in h3s):
- h3s.append(x)
- '''
-
- h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
- nyt=buildNewsSource2(name, url, h1s, h2s, h3s)
- nyt=removeBadStories(nyt, None, None, None, None, ['https://www.nytimes.com/section/magazine', 'https://www.nytimes.com/newsletters/the-interpreter'])
-
-
- return nyt
-
-
-
-
-'''
-NYT
-EXAMPLE OF BIG HEADLINE SPANNING BOTH A AND B COLUMNS
-
-<div class="span-ab-layout layout">
-
- <div class="ab-column column">
-
- <section id="top-news" class="top-news">
- <h2 class="section-heading visually-hidden">Top News</h2>
-
- <div class="above-banner-region region">
-
- <div class="collection">
- <div class="hpHeader" id="top-megapackage-kicker">
- <h6><a href="http://www.nytimes.com/pages/politics/index.html?src=hpHeader">The 45th President</a></h6>
-</div>
-
-</div>
-
- </div><!-- close above-banner-region -->
-
- <div class="span-ab-top-region region">
-
- <div class="collection">
- <article class="story theme-summary banner" id="topnews-100000004932040" data-story-id="100000004932040" data-rank="0" data-collection-renderstyle="Banner">
- <h1 class="story-heading"><a href="https://www.nytimes.com/2017/02/14/us/politics/fbi-interviewed-mike-flynn.html">F.B.I. Questioned Flynn About Russia Call</a></h1>
-</article>
-</div>
-
- </div><!-- close span-ab-top-region -->
-'''
diff --git a/unbiased/sources/__init__.py b/unbiased/sources/__init__.py new file mode 100644 index 0000000..e4a473a --- /dev/null +++ b/unbiased/sources/__init__.py @@ -0,0 +1,10 @@ +import importlib +import pkgutil + +from unbiased.sources.base import NewsSource + +def get_sources(): + for loader, name, is_pkg in pkgutil.walk_packages(__path__): + if name != 'base': + importlib.import_module('unbiased.sources.' + name) + return {x.shortname.lower(): x for x in NewsSource.__subclasses__()} diff --git a/unbiased/sources/abc.py b/unbiased/sources/abc.py new file mode 100644 index 0000000..d9092a2 --- /dev/null +++ b/unbiased/sources/abc.py @@ -0,0 +1,42 @@ +from unbiased.sources.base import NewsSource + +class ABC(NewsSource): + + name = 'ABC News' + shortname = 'ABC' + url = 'http://abcnews.go.com/' + + @classmethod + def _fetch_urls(cls): + """ + Returns three tuples of urls, one for each of + the three tiers. + """ + soup = cls._fetch_content(cls.url) + + # get primary headline + h1 = soup.find('article', class_='hero')\ + .find('div', class_='caption-wrapper').h1.a['href'] + h1s = (h1,) + + # get secondary headlines + h2s = soup.find('div', id='row-2')\ + .find_all('article', class_='card single row-item') + h2s = tuple(x.find('div', class_='caption-wrapper').h1.a['href'] for x in h2s) + + # get tertiary headlines + h3s = soup.find('div', id='row-1')\ + .find('article', class_='headlines')\ + .find('div', id='tab-content')\ + .find_all('li', class_=['story', 'wirestory']) + h3s = tuple(x.div.h1.a['href'] for x in h3s) + + return h1s, h2s, h3s + + @classmethod + def _normalize_url(cls, url): + """ + ABC News urls include an 'id' query param that we need to + keep in order for the URL to work. + """ + return NewsSource._normalize_url(url, ['id']) diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py new file mode 100644 index 0000000..9f51287 --- /dev/null +++ b/unbiased/sources/base.py @@ -0,0 +1,210 @@ +import collections +import html +import logging +import urllib + +from bs4 import BeautifulSoup +import requests + +logger = logging.getLogger('unbiased') + +class Article(object): + + def __init__(self, source, title, author, description, url, img): + self.source = source + self.title = title + self.author = author + self.description = description + self.url = url + self.img = img + + def __repr__(self): + return 'Article({}, {}, {}, {}, {}, {})'.format(self.source, self.title, self.author, self.description, self.url, self.img) + +class NewsSource(object): + """ + Abstract base class. 
+ To implement: + - set 'name', 'shortname', and 'url' + - set 'bad_' variables to blacklist terms and phrases + - implement '_fetch_urls()', which should return three tuples + of urls, one for each tier + - override any of the '_get_*()' functions as necessary + """ + + name = None + shortname = None + url = None + + bad_titles = None + bad_authors = None + bad_descriptions = None + bad_imgs = None + bad_urls = None + + def __init__(self, h1s, h2s, h3s): + self.h1s = h1s + self.h2s = h2s + self.h3s = h3s + + @classmethod + def build(cls): + h1s, h2s, h3s = cls._fetch_urls() + logger.info('Fetched {} h1s, {} h2s, {} h3s'.format(len(h1s), len(h2s), len(h3s))) + h1s = tuple(cls._normalize_url(x) for x in h1s) + h2s = tuple(cls._normalize_url(x) for x in h2s) + h3s = tuple(cls._normalize_url(x) for x in h3s) + h1s, h2s, h3s = cls._remove_duplicates(h1s, h2s, h3s) + h1s, h2s, h3s = cls._fetch_articles(h1s, h2s, h3s) + h1s, h2s, h3s = cls._remove_all_bad_stories(h1s, h2s, h3s) + return cls(h1s, h2s, h3s) + + @classmethod + def _fetch_content(cls, url): + res = requests.get(url) + if res.status_code == 200: + content = res.text + else: + raise Exception("Failed to download {}".format(url)) + return BeautifulSoup(content, 'lxml') + + @classmethod + def _normalize_url(cls, url, keep_query_vars=None): + """ + Make sure they have a scheme. + Make sure they have a host. + Trim any query string, params, or fragments. + """ + cls_url = urllib.parse.urlparse(cls.url) + url = urllib.parse.urlparse(url) + if keep_query_vars is None: + query = '' + else: + query_vars = {} + qs = urllib.parse.parse_qs(url.query) + for v in keep_query_vars: + if v in qs: + query_vars[v] = qs[v] + query_pairs = [] + for k, i in query_vars.items(): + for v in i: + query_pairs.append('{}={}'.format(k, v)) + query = '?'.join(query_pairs) + url = (url.scheme or cls_url.scheme, url.netloc or cls_url.netloc, url.path, '', query, '') + return urllib.parse.urlunparse(url) + + @classmethod + def _remove_duplicates(cls, h1s, h2s, h3s): + h2s = tuple(x for x in h2s if x not in h1s) + h3s = tuple(x for x in h3s if x not in h1s and x not in h2s) + return h1s, h2s, h3s + + @classmethod + def _remove_bad_stories(cls, articles, element, filters): + # TODO: replace string filters with regex filters + if filters is None: + return articles + new_articles = [] + for article in articles: + save = True + for f in filters: + if getattr(article, element) and f in getattr(article, element): + save = False + break + if save: + new_articles.append(article) + return tuple(new_articles) + + @classmethod + def _remove_all_bad_stories(cls, h1s, h2s, h3s): + new_articles = [] + for articles in [h1s, h2s, h3s]: + articles = cls._remove_bad_stories(articles, 'title', cls.bad_titles) + articles = cls._remove_bad_stories(articles, 'description', cls.bad_descriptions) + articles = cls._remove_bad_stories(articles, 'author', cls.bad_authors) + articles = cls._remove_bad_stories(articles, 'img', cls.bad_imgs) + articles = cls._remove_bad_stories(articles, 'url', cls.bad_urls) + new_articles.append(articles) + if len(new_articles[0]) == 0 and len(new_articles[1]) > 0: + new_articles[0] = new_articles[0] + new_articles[1][:1] + new_articles[1] = new_articles[1][1:] + return tuple(tuple(x) for x in new_articles) + + @classmethod + def _fetch_articles(cls, h1s, h2s, h3s): + ret = [] + for urls in [h1s, h2s, h3s]: + articles = [] + for url in urls: + article = cls._fetch_article(url) + if article is not None: + articles.append(article) + ret.append(articles) + return 
tuple(tuple(x) for x in ret) + + @classmethod + def _fetch_article(cls, url): + logger.debug(cls.name) + logger.debug(url) + + try: + soup = cls._fetch_content(url) + except Exception as ex: + logger.debug("""ARTICLE DOWNLOADING ERROR + SOURCE:\t{} + URL:\t{}""".format(cls.name, url)) + return None + + url_parts = urllib.parse.urlparse(url) + scheme = url_parts.scheme + + try: + img = cls._get_image(soup) + img = urllib.parse.urlparse(img, scheme=scheme).geturl() + logger.debug(img) + + title = cls._get_title(soup) + logger.debug(title) + + author = cls._get_author(soup) + logger.debug(author) + + description = cls._get_description(soup) + logger.debug(description) + description = cls._remove_self_refs(description) + logger.debug(description) + except Exception: + logger.debug("""ARTICLE PARSING ERROR + SOURCE:\t{} + URL:\t{}""".format(cls.name, url)) + return None + + return Article(cls.name, title, author, description, url, img) + + @classmethod + def _get_image(cls, soup): + return soup.find('meta', property='og:image')['content'] + + @classmethod + def _get_title(cls, soup): + return soup.find('meta', property='og:title')['content'] + + @classmethod + def _get_author(cls, soup): + for author_tag in ['article:author', 'dc.creator', 'author']: + author = soup.find('meta', property=author_tag) + if author is None: + continue + return author['content'] + return None + + @classmethod + def _get_description(cls, soup): + return soup.find('meta', property='og:description')['content'] + + @classmethod + def _remove_self_refs(cls, description): + description = description.replace(cls.name + "'s", '***') + description = description.replace(cls.name + "'", '***') + description = description.replace(cls.name, '***') + return description diff --git a/unbiased/sources/bbc.py b/unbiased/sources/bbc.py new file mode 100644 index 0000000..0dd0f80 --- /dev/null +++ b/unbiased/sources/bbc.py @@ -0,0 +1,26 @@ +from unbiased.sources.base import NewsSource + +class BBC(NewsSource): + + name = 'BBC News' + shortname = 'bbc' + url = 'http://www.bbc.com/news/world/us_and_canada' + + bad_images = ['bbc_news_logo.png'] + + @classmethod + def _fetch_urls(cls): + soup = cls._fetch_content(cls.url) + + h1s = soup.find('div', class_='buzzard-item')\ + .find('a', class_='title-link') + h1s = (h1s['href'],) + + h2s = soup.find_all('div', attrs={'class': 'pigeon__column', 'data-entityid': True}) + h2s = tuple(x.find('a', class_='title-link')['href'] for x in h2s) + + # get tertiary headlines + h3s = soup.find_all('div', attrs={'class': 'macaw-item', 'data-entityid': True}) + h3s = tuple(x.find('a', class_='title-link')['href'] for x in h3s) + + return h1s, h2s, h3s diff --git a/unbiased/sources/cbs.py b/unbiased/sources/cbs.py new file mode 100644 index 0000000..295e671 --- /dev/null +++ b/unbiased/sources/cbs.py @@ -0,0 +1,37 @@ +from unbiased.sources.base import NewsSource + +class CBS(NewsSource): + + name = 'CBS News' + shortname = 'cbs' + url = 'https://www.cbsnews.com/' + + bad_titles = ['60 Minutes'] + bad_descriptions = ['60 Minutes'] + bad_urls = ['whats-in-the-news-coverart'] + + @classmethod + def _fetch_urls(cls): + soup = cls._fetch_content(cls.url) + + # get primary headline + h1 = soup.find('h1', class_='title') + # sometimes they lead with a video + # if so, we'll pull the first h2 into the h1 slot later + if h1 is not None: + h1s = (h1.a['href'],) + + # get secondary headlines + h2s = soup.find('div', attrs={'data-tb-region': 'Big News Area Side Assets'})\ + .ul.find_all('li', 
attrs={'data-tb-region-item': True}) + h2s = tuple(x.a['href'] for x in h2s) + if h1 is None: + h1s = (h2s[0],) + h2s = tuple(h2s[1:]) + + # get tertiary headlines + h3s = soup.find('div', attrs={'data-tb-region': 'Hard News'})\ + .ul.find_all('li', attrs={'data-tb-region-item': True}) + h3s = tuple(x.a['href'] for x in h3s[:5]) + + return h1s, h2s, h3s diff --git a/unbiased/sources/csm.py b/unbiased/sources/csm.py new file mode 100644 index 0000000..4e1eea5 --- /dev/null +++ b/unbiased/sources/csm.py @@ -0,0 +1,41 @@ +from unbiased.sources.base import NewsSource + +class CSM(NewsSource): + + name = 'Christian Science Monitor' + shortname = 'csm' + url = 'https://www.csmonitor.com/USA' + + bad_titles = ['Change Agent'] + bad_imgs = ['csm_logo'] + bad_urls = ['difference-maker'] + + @classmethod + def _fetch_urls(cls): + soup = cls._fetch_content(cls.url) + + # get primary headline + h1 = soup.find('div', id='block-0-0')\ + .find('h3', class_='story_headline')\ + .a['href'] + h1s = (h1,) + + # get secondary headlines + h2_blocks = soup.find_all('div', id=['block-1-0', 'block-0-1']) + h2s = [] + for block in h2_blocks: + hblocks = block.find_all('h3', class_='story_headline') + for hblock in hblocks: + h2s += [x for x in hblock.find_all('a') if 'first-look' not in x['href']] + h2s = tuple(x['href'] for x in h2s) + + # get tertiary headlines + h3_blocks = soup.find_all('div', id='block-0-2') + h3s = [] + for block in h3_blocks: + hblocks = block.find_all('h3', class_='story_headline') + for hblock in hblocks: + h3s += [x for x in hblock.find_all('a') if 'first-look' not in x['href']] + h3s = tuple(x['href'] for x in h3s) + + return h1s, h2s, h3s diff --git a/unbiased/sources/fox.py b/unbiased/sources/fox.py new file mode 100644 index 0000000..ce7730f --- /dev/null +++ b/unbiased/sources/fox.py @@ -0,0 +1,41 @@ +from unbiased.sources.base import NewsSource + +class Fox(NewsSource): + + name = 'Fox News' + shortname = 'Fox' + url = 'http://www.foxnews.com' + + bad_titles = ['O'Reilly', 'Fox News', 'Brett Baier', 'Tucker'] + bad_descriptions = ['Sean Hannity'] + bad_authors = ['Bill O\'Reilly', 'Sean Hannity', 'Howard Kurtz'] + bad_imgs = ['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg'] + bad_urls = ['http://www.foxnews.com/opinion', 'videos.foxnews.com'] + + @classmethod + def _fetch_urls(cls): + """ + Returns three tuples of urls, one for each of + the three tiers. 
+ """ + soup = cls._fetch_content(cls.url) + + # get primary headline + h1 = soup.find('div', id='big-top')\ + .find('div', class_='primary')\ + .find('h1')\ + .find('a')['href'] + h1s = (h1,) + + # get secondary headlines + h2s = soup.find('div', id='big-top').find('div', class_='top-stories').select('li > a') + h2s = tuple(x['href'] for x in h2s) + + # get tertiary headlines + h3s = [] + for ul in soup.find('section', id='latest').find_all('ul', recursive=False): + for li in ul.find_all('li', recursive=False): + h3s.append(li.find('a')['href']) + h3s = tuple(h3s) + + return h1s, h2s, h3s diff --git a/unbiased/sources/guardian.py b/unbiased/sources/guardian.py new file mode 100644 index 0000000..5a1c3dd --- /dev/null +++ b/unbiased/sources/guardian.py @@ -0,0 +1,37 @@ +import html + +from unbiased.sources.base import NewsSource + +class TheGuardian(NewsSource): + + name = 'The Guardian' + shortname = 'Guardian' + url = 'https://www.theguardian.com/us' + + bad_authors = ['Tom McCarthy', 'Andy Hunter'] + bad_urls = ['https://www.theguardian.com/profile/ben-jacobs'] + + @classmethod + def _fetch_urls(cls): + soup = cls._fetch_content(cls.url) + + url_groups = [] + for htag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: + hblocks = soup.find('section', id='headlines').find_all(htag) + urls = [x.a['href'] for x in hblocks] + url_groups.append(urls) + url_groups = [x for x in url_groups if len(url_groups) > 0] + if len(url_groups) < 3: + raise Exception('not enough article groups on Guardian home page!') + + return tuple(url_groups[0]), tuple(url_groups[1]), tuple(url_groups[2]) + + @classmethod + def _get_image(cls, soup): + if soup.find('img', class_='maxed'): + img = soup.find('img', class_='maxed')['src'] + if soup.find('meta', itemprop='image'): + img = soup.find('meta', itemprop='image')['content'] + if soup.find('img', class_='immersive-main-media__media'): + img = soup.find('img', class_='immersive-main-media__media')['src'] + return html.unescape(img) diff --git a/unbiased/sources/npr.py b/unbiased/sources/npr.py new file mode 100644 index 0000000..e52459f --- /dev/null +++ b/unbiased/sources/npr.py @@ -0,0 +1,29 @@ +from unbiased.sources.base import NewsSource + +class NPR(NewsSource): + + name = 'NPR News' + shortname = 'npr' + url = 'http://www.npr.org/sections/news/' + + bad_titles = ['The Two-Way'] + bad_authors = ['Domenico Montanaro'] + + @classmethod + def _fetch_urls(cls): + soup = cls._fetch_content(cls.url) + + featured = soup.find('div', class_='featured-3-up')\ + .find_all('article', recursive=False) + + h1s = featured[:1] + h1s = tuple(x.find('h2', class_='title').a['href'] for x in h1s) + h2s = featured[1:] + h2s = tuple(x.find('h2', class_='title').a['href'] for x in h2s) + + # get tertiary headlines + h3s = soup.find('div', id='overflow')\ + .find_all('article', recursive=False) + h3s = tuple(x.find('h2', class_='title').a['href'] for x in h3s[:5]) + + return h1s, h2s, h3s diff --git a/unbiased/sources/thehill.py b/unbiased/sources/thehill.py new file mode 100644 index 0000000..862204e --- /dev/null +++ b/unbiased/sources/thehill.py @@ -0,0 +1,39 @@ + +from unbiased.sources.base import NewsSource + +class TheHill(NewsSource): + + name = 'The Hill' + shortname = 'Hill' + url = 'http://thehill.com' + + bad_titles = ['THE MEMO'] + bad_authors = ['Matt Schlapp', 'Juan Williams', 'Judd Gregg'] + + @classmethod + def _fetch_urls(cls): + soup = cls._fetch_content(cls.url) + + h1 = soup.find('h1', class_='top-story-headline')\ + .find('a')['href'] + h1s = (h1,) + + h23s = 
soup.find('div', class_='section-top-content')\ + .find_all('div', class_='top-story-item') + h2s = set([x.h4.a['href'] for x in h23s if 'small' not in x['class']]) + h2s = tuple(h2s) + + h3s = set([x.h4.a['href'] for x in h23s if 'small' in x['class']]) + h3s = tuple(h3s) + + return h1s, h2s, h3s + + @classmethod + def _get_description(cls, soup): + try: + return NewsSource._get_description(soup) + except Exception: + # fall back on grabbing text from the article + desc = soup.find('div', class_='field-items') + return desc.text[:200].rsplit(' ', 1)[0] + diff --git a/unbiased/sources/washtimes.py b/unbiased/sources/washtimes.py new file mode 100644 index 0000000..1be1838 --- /dev/null +++ b/unbiased/sources/washtimes.py @@ -0,0 +1,31 @@ +from unbiased.sources.base import NewsSource + +class TheWashingtonTimes(NewsSource): + + name = 'The Washington Times' + shortname = 'WashTimes' + url = 'http://www.washingtontimes.com/' + + @classmethod + def _fetch_urls(cls): + soup = cls._fetch_content(cls.url) + + h1 = soup.find('article', class_='lead-story')\ + .find(class_='article-headline')\ + .a['href'] + h1s = (h1,) + + top_articles = soup.find('section', class_='top-news')\ + .find_all('article', recursive=False) + h2s = [] + for a in top_articles: + if a.attrs.get('class') is None: + h2s.append(a.a['href']) + h2s = tuple(h2s) + + h3s = soup.find('section', class_='more-from desktop-only')\ + .ul.find_all('a') + h3s = [x['href'] for x in h3s] + h3s = tuple(h3s) + + return h1s, h2s, h3s diff --git a/unbiased/spotCheck.py b/unbiased/spotCheck.py deleted file mode 100755 index 7ce50d3..0000000 --- a/unbiased/spotCheck.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 - -import sys - -from unbiased.parser import * -from unbiased.unbiasedObjects import * - -def spotCheck(src): - - fns = {'hil' : buildTheHill, - 'cbs' : buildCBS, - 'npr' : buildNPR, - 'fox' : buildFoxNews, - 'gdn' : buildGuardian, - 'blz' : buildBlaze, - 'bbc' : buildBBC, - 'nbc' : buildNBC, - 'wat' : buildWashTimes, - 'csm' : buildCSM, - 'abc' : buildABC} - - data=fns[src]() - - print('H1s:\n--------------') - for h in data.h1Arr: - print(h.title) - - print('\n\nH2s:\n--------------') - for h in data.h2Arr: - print(h.title) - - print('\n\nH3s:\n--------------') - for h in data.h3Arr: - print(h.title) - - print('\n\n') - - - -if __name__=='__main__': - spotCheck(sys.argv[1]) diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py deleted file mode 100644 index 6ec89b7..0000000 --- a/unbiased/unbiasedFunctions.py +++ /dev/null @@ -1,277 +0,0 @@ -import html
-import io
-import logging
-import os
-import pkgutil
-import random
-import re
-import time
-import urllib.parse
-
-from PIL import Image
-import requests
-
-from unbiased.unbiasedObjects import *
-
-logger = logging.getLogger('unbiased')
-
-#take in a url and delimiters, return twitter card
-def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
-
- debugging=False
- if debugging:
- logger.debug(sourceName)
- logger.debug(url)
-
- url_parts = urllib.parse.urlparse(url)
- scheme = url_parts.scheme
-
- #download url
- try:
- res = requests.get(url)
- except Exception as ex:
- logger.debug("""ARTICLE DOWNLOADING ERROR
- SOURCE:\t{}
- URL:\t{}""".format(sourceName, url))
- return None
-
- if res.status_code == 200:
- content = res.text
- else:
- logger.debug("""ARTICLE DOWNLOADING ERROR
- SOURCE:\t{}
- URL:\t{}""".format(sourceName, url))
- return None
-
- try:
- if sourceName=='The Guardian US':
- #The Guardian puts an identifying banner on their og:images
- #grab the main image from the page instead
-
- #scenario 1: regular image
- if '<img class="maxed' in content:
- img=content.split('<img class="maxed', 1)[1]
- img=img.split('src="', 1)[1].split('"')[0]
- #scenario 2: video in image spot
- elif '<meta itemprop="image"' in content:
- img=content.split('<meta itemprop="image"', 1)[1]
- img=img.split('content="', 1)[1].split('"')[0]
- #scenario 3: photo essays
- elif '<img class="immersive-main-media__media"' in content:
- img=content.split('<img class="immersive-main-media__media"', 1)[1]
- img=img.split('src="', 1)[1].split('"')[0]
- img = html.unescape(img)
-
- else:
- if 'og:image' in content:
- img=content.split('og:image" content=')[1][1:].split('>')[0]
- elif sourceName=='ABC News':
- img='https://c1.staticflickr.com/7/6042/6276688407_12900948a2_b.jpgX'
- if img[-1]=='/':
- #because the quote separator could be ' or ",
- #trim to just before it then lop it off
- img=img[:-1].strip()
- img=img[:-1]
- # fix the scheme if it's missing
- img = urllib.parse.urlparse(img, scheme=scheme).geturl()
-
- if debugging:
- logger.debug(img)
-
- title=content.split('og:title" content=')[1][1:].split('>')[0]
- if title[-1]=='/':
- title=title[:-1].strip()
- title=title[:-1]
-
- if debugging:
- logger.debug(title)
-
-
- author=''
- if sourceName=='The Blaze':
- if 'class="article-author">' in content:
- author=content.split('class="article-author">')[1].split('<')[0]
- elif 'class="article-author" href="' in content:
- author=content.split('class="article-author" href="')[1]
- author=author.split('>')[1].split('<')[0].strip()
- else:
- authorTags=['article:author', 'dc.creator', 'property="author']
- for tag in authorTags:
- if tag in content:
- author=content.split(tag+'" content=')[1][1:].split('>')[0]
- author=author[:-1]
- #trim an extra quotation mark for The Hill
- if sourceName=='The Hill':
- author=author.split('"', 1)[0]
- break
-
- if debugging:
- logger.debug(author)
-
-
- if 'og:description' in content:
- description=content.split('og:description" content=')[1][1:].split('>')[0]
- if description[-1]=='/':
- description=description[:-1].strip()
- description=description[:-1]
- else:
- if sourceName=='The Hill':
- description=content.split('div class="field-items"')[-1]
- description=re.sub('<[^<]+?>', '', description)
- description=description[1:200]
- else:
- logger.debug("SHOULDN'T GET HERE")
-
- #strip out self-references
- description=description.replace(sourceName+"'s", '***')
- description=description.replace(sourceName+"'", '***')
- description=description.replace(sourceName, '***')
-
- if debugging:
- logger.debug(description)
-
-
- a=Article(html.unescape(title), url, img, html.unescape(description), sourceName, html.unescape(author))
- return a
-
- except Exception:
- logger.debug("""ARTICLE PARSING ERROR
- SOURCE:\t{}
- URL:\t{}""".format(sourceName, url))
- return None
-
-
-def pickStories(newsSourceArr):
- # TODO: refactor to avoid infinite loops
- #set the random order for sources
- h1RandomSources=[]
- guard = 0
- while len(h1RandomSources)<4:
- x=random.sample(range(len(newsSourceArr)), 1)[0]
- if len(newsSourceArr[x].h1Arr)>0:
- if x not in h1RandomSources:
- h1RandomSources.append(x)
- else:
- logger.debug('No H1 stories in '+newsSourceArr[x].name)
- guard += 1
- if guard > 100:
- return [], [], []
-
- #For h2s and h3s, select N random sources (can repeat), then
- #a non-repetitive random article from within
- h2RandomPairs=[]
- while len(h2RandomPairs) < 6:
- x=random.sample(range(len(newsSourceArr)), 1)[0]
- if len(newsSourceArr[x].h2Arr) > 0:
- y=random.sample(range(len(newsSourceArr[x].h2Arr)), 1)[0]
- pair=[x,y]
- if not pair in h2RandomPairs:
- h2RandomPairs.append(pair)
- else:
- logger.debug('No H2 stories in '+newsSourceArr[x].name)
-
- h3RandomPairs=[]
- while len(h3RandomPairs) < 12:
- x=random.sample(range(len(newsSourceArr)), 1)[0]
- if len(newsSourceArr[x].h3Arr) > 0:
- y=random.sample(range(len(newsSourceArr[x].h3Arr)), 1)[0]
- pair=[x,y]
- if not pair in h3RandomPairs:
- h3RandomPairs.append(pair)
- else:
- logger.debug('No H3 stories in '+newsSourceArr[x].name)
-
- # collect articles for each section
- image_index = 0
-
- top_stories = []
- for i in range(len(h1RandomSources)):
- source=newsSourceArr[h1RandomSources[i]]
- randomArticle=random.sample(range(len(source.h1Arr)), 1)[0]
- article=source.h1Arr[randomArticle]
- top_stories.append(article)
-
- middle_stories = []
- for i in range(len(h2RandomPairs)):
- pair=h2RandomPairs[i]
- article=newsSourceArr[pair[0]].h2Arr[pair[1]]
- middle_stories.append(article)
-
- bottom_stories = []
- for i in range(len(h3RandomPairs)):
- pair=h3RandomPairs[i]
- article=newsSourceArr[pair[0]].h3Arr[pair[1]]
- bottom_stories.append(article)
-
- return top_stories, middle_stories, bottom_stories
-
-def buildOutput(top_stories, middle_stories, bottom_stories):
- #read in the template html file
- from jinja2 import Environment, PackageLoader, select_autoescape
- env = Environment(
- loader=PackageLoader('unbiased', 'html_template'),
- autoescape=select_autoescape(['html', 'xml'])
- )
- template = env.get_template('unbiased.jinja.html')
-
- timestamp = time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime())
- utime = int(time.time())
-
- sourcesStr = ', '.join(set([x.source for x in top_stories] + [x.source for x in middle_stories] + [x.source for x in bottom_stories]))
-
- html = template.render(
- timestamp = timestamp,
- utime = utime,
- top_stories = top_stories,
- middle_stories = middle_stories,
- bottom_stories = bottom_stories,
- sources = sourcesStr,
- )
-
- #return updated text
- return html
-
-def writeOutputHTML(outputHTML, outDir):
- timestamp = time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime())
-
- with open(os.path.join(outDir, 'index.html'), 'w') as fp:
- fp.write(outputHTML)
-
- # copy over static package files
- for filename in ['unbiased.css', 'favicon.ico', 'favicon.png', 'apple-touch-icon.png']:
- data = pkgutil.get_data('unbiased', os.path.join('html_template', filename))
- with open(os.path.join(outDir, filename), 'wb') as fp:
- fp.write(data)
-
-def pullImage(url, index, webroot, target_width=350, target_height=200):
- extension = url.split('.')[-1].split('?')[0]
- img_name = 'img{}.{}'.format(index, extension)
- res = requests.get(url)
- if res.status_code == 200:
- content = res.content
- else:
- logger.debug('Image not found: url={}'.format(url))
- return ''
- img = Image.open(io.BytesIO(content))
- # crop to aspect ratio
- target_ar = target_width / target_height
- left, top, right, bottom = img.getbbox()
- height = bottom - top
- width = right - left
- ar = width / height
- if target_ar > ar:
- new_height = (target_height / target_width) * width
- bbox = (left, top + ((height - new_height) / 2), right, bottom - ((height - new_height) / 2))
- img = img.crop(bbox)
- elif target_ar < ar:
- new_width = (target_width / target_height) * height
- bbox = (left + ((width - new_width) / 2), top, right - ((width - new_width) / 2), bottom)
- img = img.crop(bbox)
- # resize if larger
- if target_width * 2 < width or target_height * 2 < height:
- img = img.resize((target_width*2, target_height*2), Image.LANCZOS)
- # TODO: create retina images
- jpg_name = 'img{}.jpg'.format(index)
- out_file = os.path.join(webroot, jpg_name)
- img.save(out_file, 'JPEG')
- return jpg_name
diff --git a/unbiased/unbiasedObjects.py b/unbiased/unbiasedObjects.py
deleted file mode 100644
index 9a8a78a..0000000
--- a/unbiased/unbiasedObjects.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import logging
-
-logger = logging.getLogger('unbiased')
-
-class Article():
- title=''
- url=''
- img=''
- description=''
- source=''
- author=''
-
- def __init__(self, title, url, img, description, source, author):
- self.title=title
- self.url=url
- self.img=img
- self.description=description
- self.source=source
- self.author=author
-
- def __str__(self):
- return '-----------\ntitle: {}\nauthor: {}\nsource: {}\ndescription: {}\nurl: {}\nimg: {}\n-----------'.format(self.title, self.author, self.source, self.description, self.url, self.img)
-
- def __repr__(self):
- return '{}({}, {}, {})'.format(self.source.replace(' ', ''), self.title, self.author, self.url)
-
-
-class NewsSource2():
- name=''
- url=''
- h1Arr=[]
- h2Arr=[]
- h3Arr=[]
- def __init__(self, name, url, h1Arr, h2Arr, h3Arr):
- self.name=name
- self.url=url
- self.h1Arr=h1Arr
- self.h2Arr=h2Arr
- self.h3Arr=h3Arr
-
-
-
-class NewsSource():
- name=''
- url=''
- #multiple start values to step through file. end value default to '"'
- h1SectionDividerStart=None
- h1SectionDividerEnd=None
- h1DelStart=[]
- h1DelEnd='"'
- h2SectionDividerStart=None
- h2SectionDividerEnd=None
- h2DelStart=[]
- h2DelEnd='"'
- h3SectionDividerStart=None
- h3SectionDividerEnd=None
- h3DelStart=[]
- h3DelEnd='"'
- #arrays of Article object types
- h1Arr=None
- h2Arr=None
- h3Arr=None
- #url to attach to stub links
- stubURL=''
-
- def __init__(self, name, url,
- h1DelStart, h2DelStart, h3DelStart,
- h1SectionDividerStart=None, h1SectionDividerEnd=None,
- h2SectionDividerStart=None, h2SectionDividerEnd=None,
- h3SectionDividerStart=None, h3SectionDividerEnd=None,
- stubURL=None):
- self.name=name
- self.url=url
- self.h1DelStart=h1DelStart
- self.h2DelStart=h2DelStart
- self.h3DelStart=h3DelStart
- self.h1SectionDividerStart=h1SectionDividerStart
- self.h2SectionDividerStart=h2SectionDividerStart
- self.h3SectionDividerStart=h3SectionDividerStart
- self.h1SectionDividerEnd=h1SectionDividerEnd
- self.h2SectionDividerEnd=h2SectionDividerEnd
- self.h3SectionDividerEnd=h3SectionDividerEnd
- self.h1Arr=[]
- self.h2Arr=[]
- self.h3Arr=[]
- self.stubURL=stubURL
-
- def addArticle(self, article, level):
- if level==1:
- self.h1Arr.append(article)
- elif level==2:
- self.h2Arr.append(article)
- elif level==3:
- self.h3Arr.append(article)
- else:
- logger.debug("Invalid level in NewsSource.addArtlce: " + level)
-
diff --git a/unbiased/util.py b/unbiased/util.py
new file mode 100644
index 0000000..12003b1
--- /dev/null
+++ b/unbiased/util.py
@@ -0,0 +1,113 @@
+import io
+import logging
+import os
+import pkgutil
+import random
+import shutil
+import time
+
+from PIL import Image
+import requests
+
+logger = logging.getLogger('unbiased')
+
+
+def pick_randoms(story_lists, length, per_source):
+ """
+ Return a randomly chosen list of 'length' stories, picking at
+ most 'per_source' stories from each source.
+ """
+ # TODO: weighting is incorrect if a source has fewer than 'per_source' articles
+ urandom = random.SystemRandom()
+ candidates = []
+ for stories in story_lists:
+ indexes = list(range(len(stories)))
+ urandom.shuffle(indexes)
+ random_indexes = indexes[:per_source]
+ candidates.extend([stories[x] for x in random_indexes])
+ indexes = list(range(len(candidates)))
+ urandom.shuffle(indexes)
+ random_indexes = indexes[:length]
+ return tuple(candidates[x] for x in random_indexes)
+
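+# Example (illustrative, with placeholder story lists): pick at most one
+# story from each of three sources, then keep two of the candidates in
+# random order.
+#
+#     picked = pick_randoms([['a1', 'a2'], ['b1'], ['c1', 'c2', 'c3']],
+#                           length=2, per_source=1)
+#     assert len(picked) == 2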
+
+def pickStories(newsSourceArr):
+ h1s = pick_randoms([x.h1s for x in newsSourceArr], 4, 1)
+ h2s = pick_randoms([x.h2s for x in newsSourceArr], 6, 2)
+ h3s = pick_randoms([x.h3s for x in newsSourceArr], 12, 2)
+ return h1s, h2s, h3s
+
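+# Example (illustrative; 'sources' stands for a list of built source
+# objects, each exposing h1s/h2s/h3s story lists):
+#
+#     top, middle, bottom = pickStories(sources)
+#     # yields up to 4 top, 6 middle, and 12 bottom stories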
+
+def buildOutput(top_stories, middle_stories, bottom_stories):
+ # read in the template html file
+ from jinja2 import Environment, PackageLoader, select_autoescape
+ env = Environment(
+ loader=PackageLoader('unbiased', 'html_template'),
+ autoescape=select_autoescape(['html', 'xml'])
+ )
+ template = env.get_template('unbiased.jinja.html')
+
+ timestamp = time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime())
+ utime = int(time.time())
+
+ sourcesStr = ', '.join(set([x.source for x in top_stories] + [x.source for x in middle_stories] + [x.source for x in bottom_stories]))
+
+ html = template.render(
+ timestamp=timestamp,
+ utime=utime,
+ top_stories=top_stories,
+ middle_stories=middle_stories,
+ bottom_stories=bottom_stories,
+ sources=sourcesStr,
+ )
+
+ return html
+
+
+def write_files(files_to_write, outDir):
+ for name, bytesio in files_to_write.items():
+ with open(os.path.join(outDir, name), 'wb') as fp:
+ shutil.copyfileobj(bytesio, fp)
+
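+# Example (illustrative; 'webroot' is a placeholder for the output
+# directory): buildOutput returns a str, so wrap it in a BytesIO before
+# handing it to write_files alongside any image data from pullImage.
+#
+#     page = buildOutput(top, middle, bottom)
+#     write_files({'index.html': io.BytesIO(page.encode('utf-8'))}, webroot)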
+
+def write_static_files(outDir):
+ # copy over static package files
+ for filename in ['unbiased.css', 'favicon.ico', 'favicon.png', 'apple-touch-icon.png']:
+ data = pkgutil.get_data('unbiased', os.path.join('html_template', filename))
+ with open(os.path.join(outDir, filename), 'wb') as fp:
+ fp.write(data)
+
+
+def pullImage(url, index, webroot, target_width=350, target_height=200):
+ res = requests.get(url)
+ if res.status_code == 200:
+ content = res.content
+ else:
+ logger.debug('Image not found: url={}'.format(url))
+        return '', None  # keep the return shape consistent with the success path
+ img = Image.open(io.BytesIO(content))
+ # crop to aspect ratio
+ target_ar = target_width / target_height
+ left, top, right, bottom = img.getbbox()
+ height = bottom - top
+ width = right - left
+ ar = width / height
+ if target_ar > ar:
+ new_height = (target_height / target_width) * width
+ bbox = (left, top + ((height - new_height) / 2), right, bottom - ((height - new_height) / 2))
+ img = img.crop(bbox)
+ elif target_ar < ar:
+ new_width = (target_width / target_height) * height
+ bbox = (left + ((width - new_width) / 2), top, right - ((width - new_width) / 2), bottom)
+ img = img.crop(bbox)
+ # resize if larger
+ if target_width * 2 < width or target_height * 2 < height:
+ img = img.resize((target_width * 2, target_height * 2), Image.LANCZOS)
+ # TODO: fill with a neutral color instead of just discarding alpha channel
+ img = img.convert('RGB')
+ # TODO: create retina images
+ jpg_name = 'img{}.jpg'.format(index)
+ jpg_file = io.BytesIO()
+ img.save(jpg_file, 'JPEG')
+ jpg_file.seek(0)
+ return jpg_name, jpg_file
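+
+# Example (illustrative; 'img_url' and 'webroot' are placeholders): a
+# 1600x800 image has a wider aspect ratio than the 350x200 target
+# (2.0 vs 1.75), so it is cropped to 1400x800 (trimming 100px from each
+# side) and then resized to 700x400 before being returned as JPEG data.
+#
+#     name, data = pullImage(img_url, 0, webroot)  # assumes the fetch succeeds
+#     write_files({name: data}, webroot)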