From d0c7c0541013cc9472b38ccfd614a314e9a86d70 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Sun, 10 Sep 2017 13:09:12 -0400 Subject: major refactor of news source building --- requirements.txt | 2 +- unbiased/main.py | 62 +++++------- unbiased/sources/__init__.py | 10 ++ unbiased/sources/base.py | 222 ++++++++++++++++++++++++++++++++++++++++++ unbiased/sources/fox.py | 41 ++++++++ unbiased/unbiasedFunctions.py | 87 +++++------------ 6 files changed, 320 insertions(+), 104 deletions(-) create mode 100644 unbiased/sources/__init__.py create mode 100644 unbiased/sources/base.py create mode 100644 unbiased/sources/fox.py diff --git a/requirements.txt b/requirements.txt index 3767095..0d53cea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ beautifulsoup4~=4.6.0 Jinja2~=2.9.6 -lxml=~=3.8.0 +lxml~=3.8.0 Pillow~=4.2.1 requests~=2.18.4 diff --git a/unbiased/main.py b/unbiased/main.py index 4ccda24..949e646 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -5,9 +5,8 @@ import logging import logging.config import time -from unbiased.unbiasedObjects import * -from unbiased.unbiasedFunctions import * -from unbiased.parser import * +from unbiased.unbiasedFunctions import pickStories, pullImage, buildOutput, writeOutputHTML +from unbiased.sources import get_sources logger = logging.getLogger('unbiased') @@ -52,6 +51,7 @@ def main(): parser.add_argument('-l', '--log-dir', help='location to write detailed logs') parser.add_argument('-d', '--debug', action='store_true', help='run in debug mode') parser.add_argument('-o', '--oneshot', action='store_true', help='run once and exit') + parser.add_argument('-s', '--sources', type=lambda x: x.split(','), default=None) args = parser.parse_args() if args.log_dir: @@ -67,7 +67,7 @@ def main(): while True: logger.info('Starting crawl') start = time.time() - run(args.webroot) + run(args.webroot, args.sources) finish = time.time() runtime = finish - start sleeptime = crawl_frequency - runtime @@ -77,51 +77,33 @@ def main(): if sleeptime > 0: time.sleep(sleeptime) -def run(webroot): - sources = [] - - ''' - SOURCES TO ADD NEXT: - -REUTERS - -Town Hall - ''' - - logger.debug('Running with webroot="{}"'.format(webroot)) - - ### These values have to be the second half of the function name - ### E.g. Guardian calls buildGuardian(), etc. - sourceFnArr = [ - 'Guardian', - 'TheHill', - 'NPR', - 'BBC', - 'NBC', - 'CBS', - 'FoxNews', - 'WashTimes', - 'CSM', - 'ABC', - ] - - for source in sourceFnArr: - logger.info('Crawling {}'.format(source)) +def run(webroot, source_names): + + logger.debug('Running with webroot="{}" for sources="{}"'.format(webroot, source_names)) + + sources = get_sources() + print(sources) + if source_names is None: + sources = sources.values() + else: + sources = [sources[x] for x in source_names] + + built_sources = [] + for source in sources: + logger.info('Crawling {}'.format(source.name)) tries = 0 while tries < 3: time.sleep(tries) try: - fn = 'build' + source - possibles = globals().copy() - possibles.update(locals()) - method = possibles.get(fn) - src = method() - sources.append(src) + built_sources.append(source.build()) break except Exception as ex: tries += 1 if tries == 3: - logger.error('Build failed. source={} ex={}'.format(source, ex)) + logger.error('Build failed. source={} ex={}'.format(source.name, ex)) else: - logger.debug('Build failed, retrying. source={} ex={}'.format(source, ex)) + logger.debug('Build failed, retrying. 
source={} ex={}'.format(source.name, ex)) + sources = tuple(built_sources) logger.info('Parsed home pages for: {}'.format([x.name for x in sources])) top_stories, middle_stories, bottom_stories = pickStories(sources) diff --git a/unbiased/sources/__init__.py b/unbiased/sources/__init__.py new file mode 100644 index 0000000..e4a473a --- /dev/null +++ b/unbiased/sources/__init__.py @@ -0,0 +1,10 @@ +import importlib +import pkgutil + +from unbiased.sources.base import NewsSource + +def get_sources(): + for loader, name, is_pkg in pkgutil.walk_packages(__path__): + if name != 'base': + importlib.import_module('unbiased.sources.' + name) + return {x.shortname.lower(): x for x in NewsSource.__subclasses__()} diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py new file mode 100644 index 0000000..9dc14fd --- /dev/null +++ b/unbiased/sources/base.py @@ -0,0 +1,222 @@ +import collections +import html +import logging +import urllib + +from bs4 import BeautifulSoup +import requests + +logger = logging.getLogger('unbiased') + +class Article(object): + + def __init__(self, source, title, author, description, url, img): + self.source = source + self.title = title + self.author = author + self.description = description + self.url = url + self.img = img + + def __repr__(self): + return 'Article({}, {}, {}, {}, {}, {})'.format(self.source, self.title, self.author, self.description, self.url, self.img) + +class NewsSource(object): + """ + Abstract base class. + To implement: + - set 'name', 'shortname', and 'url' + - set 'bad_' variables to blacklist terms and phrases + - implement '_fetch_urls()', which should return three tuples + of urls, one for each tier + - override any of the '_get_*()' functions as necessary + """ + # TODO: replace all string parsing with bs4 + + name = None + shortname = None + url = None + + bad_titles = None + bad_authors = None + bad_descriptions = None + bad_imgs = None + bad_urls = None + + def __init__(self, h1s, h2s, h3s): + self.h1s = h1s + self.h2s = h2s + self.h3s = h3s + + @classmethod + def build(cls): + h1s, h2s, h3s = cls._fetch_urls() + h1s = tuple(cls._fix_url(x) for x in h1s) + h2s = tuple(cls._fix_url(x) for x in h2s) + h3s = tuple(cls._fix_url(x) for x in h3s) + h1s, h2s, h3s = cls._remove_duplicates(h1s, h2s, h3s) + h1s, h2s, h3s = cls._fetch_articles(h1s, h2s, h3s) + h1s, h2s, h3s = cls._remove_all_bad_stories(h1s, h2s, h3s) + return cls(h1s, h2s, h3s) + + @classmethod + def _fetch_content(cls, url): + res = requests.get(url) + if res.status_code == 200: + content = res.text + else: + raise Exception("Failed to download {}".format(url)) + return BeautifulSoup(content, 'lxml') + + @classmethod + def _fix_url(cls, url, scheme='http'): + """ + Make sure they have a scheme. + Trim any query parameters. 
+ """ + # TODO: proper URL parsing + if url.startswith('//'): + url = '{}:{}'.format(scheme, x) + url = url.split('?')[0] + return url + + @classmethod + def _remove_duplicates(cls, h1s, h2s, h3s): + h2s = tuple(x for x in h2s if x not in h1s) + h3s = tuple(x for x in h3s if x not in h1s and x not in h2s) + return h1s, h2s, h3s + + @classmethod + def _remove_bad_stories(cls, articles, element, filters): + if filters is None: + return articles + new_articles = [] + for article in articles: + save = True + for f in filters: + if f in getattr(article, element): + save = False + break + if save: + new_articles.append(article) + return tuple(new_articles) + + @classmethod + def _remove_all_bad_stories(cls, h1s, h2s, h3s): + new_articles = [] + for articles in [h1s, h2s, h3s]: + articles = cls._remove_bad_stories(articles, 'title', cls.bad_titles) + articles = cls._remove_bad_stories(articles, 'description', cls.bad_descriptions) + articles = cls._remove_bad_stories(articles, 'author', cls.bad_authors) + articles = cls._remove_bad_stories(articles, 'img', cls.bad_imgs) + articles = cls._remove_bad_stories(articles, 'url', cls.bad_urls) + new_articles.append(articles) + if len(new_articles[0]) == 0 and len(new_articles[1]) > 0: + new_articles[0].append(new_articles[1].pop()) + return tuple(tuple(x) for x in new_articles) + + @classmethod + def _fetch_articles(cls, h1s, h2s, h3s): + ret = [] + for urls in [h1s, h2s, h3s]: + articles = [] + for url in urls: + article = cls._fetch_article(url) + if article is not None: + articles.append(article) + ret.append(articles) + return tuple(tuple(x) for x in ret) + + @classmethod + def _fetch_article(cls, url): + #soup = cls._fetch_content(url) + + logger.debug(cls.name) + logger.debug(url) + + url_parts = urllib.parse.urlparse(url) + scheme = url_parts.scheme + + # download url + try: + res = requests.get(url) + except Exception as ex: + logger.debug("""ARTICLE DOWNLOADING ERROR + SOURCE:\t{} + URL:\t{}""".format(cls.name, url)) + return None + + if res.status_code == 200: + content = res.text + else: + logger.debug("""ARTICLE DOWNLOADING ERROR + SOURCE:\t{} + URL:\t{}""".format(cls.name, url)) + return None + + try: + img = cls._get_image(content) + img = urllib.parse.urlparse(img, scheme=scheme).geturl() + logger.debug(img) + + title = cls._get_title(content) + logger.debug(title) + + author = cls._get_author(content) + logger.debug(author) + + description = cls._get_description(content) + logger.debug(description) + description = cls._remove_self_refs(description) + logger.debug(description) + except Exception: + logger.debug("""ARTICLE PARSING ERROR + SOURCE:\t{} + URL:\t{}""".format(cls.name, url)) + return None + + return Article(cls.name, title, author, description, url, img) + + @classmethod + def _get_image(cls, content): + img = content.split('og:image" content=')[1][1:].split('>')[0] + if img[-1] == '/': + #because the quote separator could be ' or ", + #trim to just before it then lop it off + img = img[:-1].strip() + img = img[:-1] + return img + + @classmethod + def _get_title(cls, content): + title=content.split('og:title" content=')[1][1:].split('>')[0] + if title[-1]=='/': + title=title[:-1].strip() + title=title[:-1] + return title + + @classmethod + def _get_author(cls, content): + author = '' + authorTags = ['article:author', 'dc.creator', 'property="author'] + for tag in authorTags: + if tag in content: + author = content.split(tag+'" content=')[1][1:].split('>')[0] + author = author[:-1] + break + return author + + @classmethod + def 
_get_description(cls, content): + description = content.split('og:description" content=')[1][1:].split('>')[0] + if description[-1] == '/': + description = description[:-1].strip() + description = description[:-1] + return description + + @classmethod + def _remove_self_refs(cls, description): + description = description.replace(cls.name+"'s", '***') + description = description.replace(cls.name+"'", '***') + description = description.replace(cls.name, '***') + return description diff --git a/unbiased/sources/fox.py b/unbiased/sources/fox.py new file mode 100644 index 0000000..ce7730f --- /dev/null +++ b/unbiased/sources/fox.py @@ -0,0 +1,41 @@ +from unbiased.sources.base import NewsSource + +class Fox(NewsSource): + + name = 'Fox News' + shortname = 'Fox' + url = 'http://www.foxnews.com' + + bad_titles = ['O'Reilly', 'Fox News', 'Brett Baier', 'Tucker'] + bad_descriptions = ['Sean Hannity'] + bad_authors = ['Bill O\'Reilly', 'Sean Hannity', 'Howard Kurtz'] + bad_imgs = ['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg'] + bad_urls = ['http://www.foxnews.com/opinion', 'videos.foxnews.com'] + + @classmethod + def _fetch_urls(cls): + """ + Returns three tuples of urls, one for each of + the three tiers. + """ + soup = cls._fetch_content(cls.url) + + # get primary headline + h1 = soup.find('div', id='big-top')\ + .find('div', class_='primary')\ + .find('h1')\ + .find('a')['href'] + h1s = (h1,) + + # get secondary headlines + h2s = soup.find('div', id='big-top').find('div', class_='top-stories').select('li > a') + h2s = tuple(x['href'] for x in h2s) + + # get tertiary headlines + h3s = [] + for ul in soup.find('section', id='latest').find_all('ul', recursive=False): + for li in ul.find_all('li', recursive=False): + h3s.append(li.find('a')['href']) + h3s = tuple(h3s) + + return h1s, h2s, h3s diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 6ec89b7..7825d93 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -11,8 +11,6 @@ import urllib.parse from PIL import Image import requests -from unbiased.unbiasedObjects import * - logger = logging.getLogger('unbiased') #take in a url and delimiters, return twitter card @@ -141,69 +139,30 @@ def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, return None -def pickStories(newsSourceArr): - # TODO: refactor to avoid infinite loops - #set the random order for sources - h1RandomSources=[] - guard = 0 - while len(h1RandomSources)<4: - x=random.sample(range(len(newsSourceArr)), 1)[0] - if len(newsSourceArr[x].h1Arr)>0: - if x not in h1RandomSources: - h1RandomSources.append(x) - else: - logger.debug('No H1 stories in '+newsSourceArr[x].name) - guard += 1 - if guard > 100: - return [], [], [] - - #For h2s and h3s, select N random sources (can repeat), then - #a non-repetitive random article from within - h2RandomPairs=[] - while len(h2RandomPairs) < 6: - x=random.sample(range(len(newsSourceArr)), 1)[0] - if len(newsSourceArr[x].h2Arr) > 0: - y=random.sample(range(len(newsSourceArr[x].h2Arr)), 1)[0] - pair=[x,y] - if not pair in h2RandomPairs: - h2RandomPairs.append(pair) - else: - logger.debug('No H2 stories in '+newsSourceArr[x].name) - - h3RandomPairs=[] - while len(h3RandomPairs) < 12: - x=random.sample(range(len(newsSourceArr)), 1)[0] - if len(newsSourceArr[x].h3Arr) > 0: - y=random.sample(range(len(newsSourceArr[x].h3Arr)), 1)[0] - pair=[x,y] - if not pair in h3RandomPairs: - h3RandomPairs.append(pair) - else: - logger.debug('No H3 stories in 
'+newsSourceArr[x].name) +def pick_randoms(story_lists, length, per_source): + """ + Return a randomly chosen list of 'length' stories, picking at + most 'per_source' stories from each source. + """ + # TODO: weighting is incorrect if a source has fewer than 'per_source' articles + urandom = random.SystemRandom() + candidates = [] + for stories in story_lists: + indexes = list(range(len(stories))) + urandom.shuffle(indexes) + random_indexes = indexes[:per_source] + candidates.extend([stories[x] for x in random_indexes]) + indexes = list(range(len(candidates))) + urandom.shuffle(indexes) + random_indexes = indexes[:length] + return tuple(candidates[x] for x in random_indexes) - # collect articles for each section - image_index = 0 - top_stories = [] - for i in range(len(h1RandomSources)): - source=newsSourceArr[h1RandomSources[i]] - randomArticle=random.sample(range(len(source.h1Arr)), 1)[0] - article=source.h1Arr[randomArticle] - top_stories.append(article) - - middle_stories = [] - for i in range(len(h2RandomPairs)): - pair=h2RandomPairs[i] - article=newsSourceArr[pair[0]].h2Arr[pair[1]] - middle_stories.append(article) - - bottom_stories = [] - for i in range(len(h3RandomPairs)): - pair=h3RandomPairs[i] - article=newsSourceArr[pair[0]].h3Arr[pair[1]] - bottom_stories.append(article) - - return top_stories, middle_stories, bottom_stories +def pickStories(newsSourceArr): + h1s = pick_randoms([x.h1s for x in newsSourceArr], 4, 1) + h2s = pick_randoms([x.h2s for x in newsSourceArr], 6, 2) + h3s = pick_randoms([x.h3s for x in newsSourceArr], 12, 2) + return h1s, h2s, h3s def buildOutput(top_stories, middle_stories, bottom_stories): #read in the template html file @@ -270,6 +229,8 @@ def pullImage(url, index, webroot, target_width=350, target_height=200): # resize if larger if target_width * 2 < width or target_height * 2 < height: img = img.resize((target_width*2, target_height*2), Image.LANCZOS) + # TODO: fill with a neutral color instead of just discarding alpha channel + img = img.convert('RGB') # TODO: create retina images jpg_name = 'img{}.jpg'.format(index) out_file = os.path.join(webroot, jpg_name) -- cgit v1.2.3 From 985ce4c540cd437b6e6475fb0e969f2aea0bd901 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Mon, 11 Sep 2017 20:09:41 -0400 Subject: normalize urls using urllib --- unbiased/sources/base.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py index 9dc14fd..51c4cff 100644 --- a/unbiased/sources/base.py +++ b/unbiased/sources/base.py @@ -51,9 +51,9 @@ class NewsSource(object): @classmethod def build(cls): h1s, h2s, h3s = cls._fetch_urls() - h1s = tuple(cls._fix_url(x) for x in h1s) - h2s = tuple(cls._fix_url(x) for x in h2s) - h3s = tuple(cls._fix_url(x) for x in h3s) + h1s = tuple(cls._normalize_url(x) for x in h1s) + h2s = tuple(cls._normalize_url(x) for x in h2s) + h3s = tuple(cls._normalize_url(x) for x in h3s) h1s, h2s, h3s = cls._remove_duplicates(h1s, h2s, h3s) h1s, h2s, h3s = cls._fetch_articles(h1s, h2s, h3s) h1s, h2s, h3s = cls._remove_all_bad_stories(h1s, h2s, h3s) @@ -69,16 +69,14 @@ class NewsSource(object): return BeautifulSoup(content, 'lxml') @classmethod - def _fix_url(cls, url, scheme='http'): + def _normalize_url(cls, url, scheme='http'): """ Make sure they have a scheme. - Trim any query parameters. + Trim any query string, params, or fragments. 
""" - # TODO: proper URL parsing - if url.startswith('//'): - url = '{}:{}'.format(scheme, x) - url = url.split('?')[0] - return url + url = urllib.parse.urlparse(url) + url = (url.scheme or scheme, url.netloc, url.path, '', '', '') + return urllib.parse.urlunparse(url) @classmethod def _remove_duplicates(cls, h1s, h2s, h3s): -- cgit v1.2.3 From 3681d1883a02962a331b5ed0c4cb085ba86b0324 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Mon, 11 Sep 2017 20:40:18 -0400 Subject: update NewsSource base parsing to use beautifulsoup --- unbiased/sources/base.py | 74 ++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 50 deletions(-) diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py index 51c4cff..4ff7bf3 100644 --- a/unbiased/sources/base.py +++ b/unbiased/sources/base.py @@ -86,13 +86,14 @@ class NewsSource(object): @classmethod def _remove_bad_stories(cls, articles, element, filters): + # TODO: replace string filters with regex filters if filters is None: return articles new_articles = [] for article in articles: save = True for f in filters: - if f in getattr(article, element): + if getattr(article, element) and f in getattr(article, element): save = False break if save: @@ -127,43 +128,32 @@ class NewsSource(object): @classmethod def _fetch_article(cls, url): - #soup = cls._fetch_content(url) - logger.debug(cls.name) logger.debug(url) - url_parts = urllib.parse.urlparse(url) - scheme = url_parts.scheme - - # download url try: - res = requests.get(url) + soup = cls._fetch_content(url) except Exception as ex: logger.debug("""ARTICLE DOWNLOADING ERROR SOURCE:\t{} URL:\t{}""".format(cls.name, url)) return None - if res.status_code == 200: - content = res.text - else: - logger.debug("""ARTICLE DOWNLOADING ERROR - SOURCE:\t{} - URL:\t{}""".format(cls.name, url)) - return None + url_parts = urllib.parse.urlparse(url) + scheme = url_parts.scheme try: - img = cls._get_image(content) + img = cls._get_image(soup) img = urllib.parse.urlparse(img, scheme=scheme).geturl() logger.debug(img) - title = cls._get_title(content) + title = cls._get_title(soup) logger.debug(title) - author = cls._get_author(content) + author = cls._get_author(soup) logger.debug(author) - description = cls._get_description(content) + description = cls._get_description(soup) logger.debug(description) description = cls._remove_self_refs(description) logger.debug(description) @@ -176,45 +166,29 @@ class NewsSource(object): return Article(cls.name, title, author, description, url, img) @classmethod - def _get_image(cls, content): - img = content.split('og:image" content=')[1][1:].split('>')[0] - if img[-1] == '/': - #because the quote separator could be ' or ", - #trim to just before it then lop it off - img = img[:-1].strip() - img = img[:-1] - return img + def _get_image(cls, soup): + return soup.find('meta', property='og:image')['content'] @classmethod - def _get_title(cls, content): - title=content.split('og:title" content=')[1][1:].split('>')[0] - if title[-1]=='/': - title=title[:-1].strip() - title=title[:-1] - return title + def _get_title(cls, soup): + return soup.find('meta', property='og:title')['content'] @classmethod - def _get_author(cls, content): - author = '' - authorTags = ['article:author', 'dc.creator', 'property="author'] - for tag in authorTags: - if tag in content: - author = content.split(tag+'" content=')[1][1:].split('>')[0] - author = author[:-1] - break - return author + def _get_author(cls, soup): + for author_tag in ['article:author', 'dc.creator', 
'property="author']: + author = soup.find(author_tag) + if author is None: + continue + return author['content'] + return None @classmethod - def _get_description(cls, content): - description = content.split('og:description" content=')[1][1:].split('>')[0] - if description[-1] == '/': - description = description[:-1].strip() - description = description[:-1] - return description + def _get_description(cls, soup): + return soup.find('meta', property='og:description')['content'] @classmethod def _remove_self_refs(cls, description): - description = description.replace(cls.name+"'s", '***') - description = description.replace(cls.name+"'", '***') + description = description.replace(cls.name + "'s", '***') + description = description.replace(cls.name + "'", '***') description = description.replace(cls.name, '***') return description -- cgit v1.2.3 From fd7a13e3c3641697358d97d23a45cda59bcea59a Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Mon, 11 Sep 2017 20:54:35 -0400 Subject: ignore vim swp files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 9e0f924..ad2b57a 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ venv/ unbiased.egg-info/ #* .#* +*.swp -- cgit v1.2.3 From e53b324c148e81f4e4dff009670639825f2a2006 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Mon, 11 Sep 2017 21:46:11 -0400 Subject: keep files in memory and only write to disk at the very end --- unbiased/main.py | 21 ++++++++++++++------- unbiased/sources/base.py | 1 - unbiased/unbiasedFunctions.py | 17 ++++++++++------- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/unbiased/main.py b/unbiased/main.py index 949e646..7b057ea 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -1,11 +1,12 @@ #!/usr/bin/env python3 import argparse +import io import logging import logging.config import time -from unbiased.unbiasedFunctions import pickStories, pullImage, buildOutput, writeOutputHTML +from unbiased.unbiasedFunctions import pickStories, pullImage, buildOutput, write_files, write_static_files from unbiased.sources import get_sources logger = logging.getLogger('unbiased') @@ -111,20 +112,26 @@ def run(webroot, source_names): logger.info('Picked middle stories from: {}'.format([x.source for x in middle_stories])) logger.info('Picked bottom stories from: {}'.format([x.source for x in bottom_stories])) + files_to_write = {} + # download images img_idx = 0 for story in top_stories: - story.img = pullImage(story.img, img_idx, webroot, 350, 200) + story.img, img_jpg = pullImage(story.img, img_idx, webroot, 350, 200) + files_to_write[story.img] = img_jpg img_idx += 1 for story in middle_stories: - story.img = pullImage(story.img, img_idx, webroot, 150, 100) + story.img, img_jpg = pullImage(story.img, img_idx, webroot, 150, 100) + files_to_write[story.img] = img_jpg img_idx += 1 - #build the output file HTML - outputHTML = buildOutput(top_stories, middle_stories, bottom_stories) + # build the output file HTML + output_html = buildOutput(top_stories, middle_stories, bottom_stories) + output_html = io.BytesIO(output_html.encode('utf8')) + files_to_write['index.html'] = output_html - #print the output file HTML - writeOutputHTML(outputHTML, webroot) + write_files(files_to_write, webroot) + write_static_files(webroot) if __name__=="__main__": main() diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py index 4ff7bf3..68e7f0d 100644 --- a/unbiased/sources/base.py +++ b/unbiased/sources/base.py @@ -31,7 +31,6 @@ class NewsSource(object): of urls, one for 
each tier - override any of the '_get_*()' functions as necessary """ - # TODO: replace all string parsing with bs4 name = None shortname = None diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 7825d93..b07245c 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -5,6 +5,7 @@ import os import pkgutil import random import re +import shutil import time import urllib.parse @@ -190,12 +191,12 @@ def buildOutput(top_stories, middle_stories, bottom_stories): #return updated text return html -def writeOutputHTML(outputHTML, outDir): - timestamp = time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) - - with open(os.path.join(outDir, 'index.html'), 'w') as fp: - fp.write(outputHTML) +def write_files(files_to_write, outDir): + for name, bytesio in files_to_write.items(): + with open(os.path.join(outDir, name), 'wb') as fp: + shutil.copyfileobj(bytesio, fp) +def write_static_files(outDir): # copy over static package files for filename in ['unbiased.css', 'favicon.ico', 'favicon.png', 'apple-touch-icon.png']: data = pkgutil.get_data('unbiased', os.path.join('html_template', filename)) @@ -233,6 +234,8 @@ def pullImage(url, index, webroot, target_width=350, target_height=200): img = img.convert('RGB') # TODO: create retina images jpg_name = 'img{}.jpg'.format(index) + jpg_file = io.BytesIO() out_file = os.path.join(webroot, jpg_name) - img.save(out_file, 'JPEG') - return jpg_name + img.save(jpg_file, 'JPEG') + jpg_file.seek(0) + return jpg_name, jpg_file -- cgit v1.2.3 From 0584698995cc748434cddd4a1a3baa56ff7aa180 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Mon, 11 Sep 2017 23:27:50 -0400 Subject: news source for The Hill --- unbiased/sources/base.py | 4 ++-- unbiased/sources/thehill.py | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 unbiased/sources/thehill.py diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py index 68e7f0d..b4bff75 100644 --- a/unbiased/sources/base.py +++ b/unbiased/sources/base.py @@ -174,8 +174,8 @@ class NewsSource(object): @classmethod def _get_author(cls, soup): - for author_tag in ['article:author', 'dc.creator', 'property="author']: - author = soup.find(author_tag) + for author_tag in ['article:author', 'dc.creator', 'author']: + author = soup.find('meta', property=author_tag) if author is None: continue return author['content'] diff --git a/unbiased/sources/thehill.py b/unbiased/sources/thehill.py new file mode 100644 index 0000000..c678261 --- /dev/null +++ b/unbiased/sources/thehill.py @@ -0,0 +1,41 @@ +import urllib + +from unbiased.sources.base import NewsSource + +class TheHill(NewsSource): + + name = 'The Hill' + shortname = 'Hill' + url = 'http://thehill.com' + + bad_titles = ['THE MEMO'] + bad_authors = ['Matt Schlapp', 'Juan Williams', 'Judd Gregg'] + + @classmethod + def _fetch_urls(cls): + soup = cls._fetch_content(cls.url) + + h1 = soup.find('h1', class_='top-story-headline')\ + .find('a')['href'] + h1 = urllib.parse.urljoin(cls.url, h1) + h1s = (h1,) + + h23s = soup.find('div', class_='section-top-content')\ + .find_all('div', class_='top-story-item') + h2s = set([x.h4.a['href'] for x in h23s if 'small' not in x['class']]) + h2s = tuple(urllib.parse.urljoin(cls.url, x) for x in h2s) + + h3s = set([x.h4.a['href'] for x in h23s if 'small' in x['class']]) + h3s = tuple(urllib.parse.urljoin(cls.url, x) for x in h3s) + + return h1s, h2s, h3s + + @classmethod + def _get_description(cls, soup): + try: + return 
NewsSource._get_description(soup) + except Exception: + # fall back on grabbing text from the article + desc = soup.find('div', class_='field-items') + return desc.text[:200].rsplit(' ', 1)[0] + -- cgit v1.2.3 From 22473179b0677ad50fd0d3284726683cf00c54e0 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Tue, 12 Sep 2017 22:53:07 -0400 Subject: fix bug in shifting up tier two stories --- unbiased/sources/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py index b4bff75..af0a53e 100644 --- a/unbiased/sources/base.py +++ b/unbiased/sources/base.py @@ -110,7 +110,8 @@ class NewsSource(object): articles = cls._remove_bad_stories(articles, 'url', cls.bad_urls) new_articles.append(articles) if len(new_articles[0]) == 0 and len(new_articles[1]) > 0: - new_articles[0].append(new_articles[1].pop()) + new_articles[0] = new_articles[0] + new_articles[1][:1] + new_articles[1] = new_articles[1][1:] return tuple(tuple(x) for x in new_articles) @classmethod -- cgit v1.2.3 From 9b5f9b4f1be2563ebb639f90a943649d0165b7b8 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Tue, 12 Sep 2017 22:53:23 -0400 Subject: new source The Guardian --- unbiased/sources/guardian.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 unbiased/sources/guardian.py diff --git a/unbiased/sources/guardian.py b/unbiased/sources/guardian.py new file mode 100644 index 0000000..dff098b --- /dev/null +++ b/unbiased/sources/guardian.py @@ -0,0 +1,38 @@ +import urllib +import html + +from unbiased.sources.base import NewsSource + +class TheGuardian(NewsSource): + + name = 'The Guardian' + shortname = 'Guardian' + url = 'https://www.theguardian.com/us' + + bad_authors = ['Tom McCarthy', 'Andy Hunter'] + bad_urls = ['https://www.theguardian.com/profile/ben-jacobs'] + + @classmethod + def _fetch_urls(cls): + soup = cls._fetch_content(cls.url) + + url_groups = [] + for htag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: + hblocks = soup.find('section', id='headlines').find_all(htag) + urls = [x.a['href'] for x in hblocks] + url_groups.append(urls) + url_groups = [x for x in url_groups if len(url_groups) > 0] + if len(url_groups) < 3: + raise Exception('not enough article groups on Guardian home page!') + + return tuple(url_groups[0]), tuple(url_groups[1]), tuple(url_groups[2]) + + @classmethod + def _get_image(cls, soup): + if soup.find('img', class_='maxed'): + img = soup.find('img', class_='maxed')['src'] + if soup.find('meta', itemprop='image'): + img = soup.find('meta', itemprop='image')['content'] + if soup.find('img', class_='immersive-main-media__media'): + img = soup.find('img', class_='immersive-main-media__media')['src'] + return html.unescape(img) -- cgit v1.2.3 From b50c2e7acc6ef45eb859acba645b628e444d7939 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Tue, 12 Sep 2017 22:53:36 -0400 Subject: new source The Washington Times --- unbiased/sources/washtimes.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 unbiased/sources/washtimes.py diff --git a/unbiased/sources/washtimes.py b/unbiased/sources/washtimes.py new file mode 100644 index 0000000..e344af6 --- /dev/null +++ b/unbiased/sources/washtimes.py @@ -0,0 +1,34 @@ +import urllib + +from unbiased.sources.base import NewsSource + +class TheWashingtonTimes(NewsSource): + + name = 'The Washington Times' + shortname = 'WashTimes' + url = 'http://www.washingtontimes.com/' + + @classmethod + def _fetch_urls(cls): + soup 
= cls._fetch_content(cls.url) + + h1 = soup.find('article', class_='lead-story')\ + .find(class_='article-headline')\ + .a['href'] + h1 = urllib.parse.urljoin(cls.url, h1) + h1s = (h1,) + + top_articles = soup.find('section', class_='top-news')\ + .find_all('article', recursive=False) + h2s = [] + for a in top_articles: + if a.attrs.get('class') is None: + h2s.append(a.a['href']) + h2s = tuple(urllib.parse.urljoin(cls.url, x) for x in h2s) + + h3s = soup.find('section', class_='more-from desktop-only')\ + .ul.find_all('a') + h3s = [x['href'] for x in h3s] + h3s = tuple(urllib.parse.urljoin(cls.url, x) for x in h3s) + + return h1s, h2s, h3s -- cgit v1.2.3 From 0854c3c73d38e75f8e30363f9a05b87a12c5290d Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Tue, 12 Sep 2017 23:03:26 -0400 Subject: update url with host in base parser --- unbiased/sources/base.py | 6 ++++-- unbiased/sources/guardian.py | 1 - unbiased/sources/thehill.py | 6 ++---- unbiased/sources/washtimes.py | 7 ++----- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py index af0a53e..e91e5a8 100644 --- a/unbiased/sources/base.py +++ b/unbiased/sources/base.py @@ -68,13 +68,15 @@ class NewsSource(object): return BeautifulSoup(content, 'lxml') @classmethod - def _normalize_url(cls, url, scheme='http'): + def _normalize_url(cls, url): """ Make sure they have a scheme. + Make sure they have a host. Trim any query string, params, or fragments. """ + cls_url = urllib.parse.urlparse(cls.url) url = urllib.parse.urlparse(url) - url = (url.scheme or scheme, url.netloc, url.path, '', '', '') + url = (url.scheme or cls_url.scheme, url.netloc or cls_url.netloc, url.path, '', '', '') return urllib.parse.urlunparse(url) @classmethod diff --git a/unbiased/sources/guardian.py b/unbiased/sources/guardian.py index dff098b..5a1c3dd 100644 --- a/unbiased/sources/guardian.py +++ b/unbiased/sources/guardian.py @@ -1,4 +1,3 @@ -import urllib import html from unbiased.sources.base import NewsSource diff --git a/unbiased/sources/thehill.py b/unbiased/sources/thehill.py index c678261..862204e 100644 --- a/unbiased/sources/thehill.py +++ b/unbiased/sources/thehill.py @@ -1,4 +1,3 @@ -import urllib from unbiased.sources.base import NewsSource @@ -17,16 +16,15 @@ class TheHill(NewsSource): h1 = soup.find('h1', class_='top-story-headline')\ .find('a')['href'] - h1 = urllib.parse.urljoin(cls.url, h1) h1s = (h1,) h23s = soup.find('div', class_='section-top-content')\ .find_all('div', class_='top-story-item') h2s = set([x.h4.a['href'] for x in h23s if 'small' not in x['class']]) - h2s = tuple(urllib.parse.urljoin(cls.url, x) for x in h2s) + h2s = tuple(h2s) h3s = set([x.h4.a['href'] for x in h23s if 'small' in x['class']]) - h3s = tuple(urllib.parse.urljoin(cls.url, x) for x in h3s) + h3s = tuple(h3s) return h1s, h2s, h3s diff --git a/unbiased/sources/washtimes.py b/unbiased/sources/washtimes.py index e344af6..1be1838 100644 --- a/unbiased/sources/washtimes.py +++ b/unbiased/sources/washtimes.py @@ -1,5 +1,3 @@ -import urllib - from unbiased.sources.base import NewsSource class TheWashingtonTimes(NewsSource): @@ -15,7 +13,6 @@ class TheWashingtonTimes(NewsSource): h1 = soup.find('article', class_='lead-story')\ .find(class_='article-headline')\ .a['href'] - h1 = urllib.parse.urljoin(cls.url, h1) h1s = (h1,) top_articles = soup.find('section', class_='top-news')\ @@ -24,11 +21,11 @@ class TheWashingtonTimes(NewsSource): for a in top_articles: if a.attrs.get('class') is None: h2s.append(a.a['href']) - 
h2s = tuple(urllib.parse.urljoin(cls.url, x) for x in h2s) + h2s = tuple(h2s) h3s = soup.find('section', class_='more-from desktop-only')\ .ul.find_all('a') h3s = [x['href'] for x in h3s] - h3s = tuple(urllib.parse.urljoin(cls.url, x) for x in h3s) + h3s = tuple(h3s) return h1s, h2s, h3s -- cgit v1.2.3 From e674ae4ca972e2f902dcc96d65fd4e792668b8a2 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Wed, 27 Sep 2017 21:17:17 -0400 Subject: let _normalize_urls() optionally preserve some query args --- unbiased/sources/base.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py index e91e5a8..14d867e 100644 --- a/unbiased/sources/base.py +++ b/unbiased/sources/base.py @@ -68,7 +68,7 @@ class NewsSource(object): return BeautifulSoup(content, 'lxml') @classmethod - def _normalize_url(cls, url): + def _normalize_url(cls, url, keep_query_vars=None): """ Make sure they have a scheme. Make sure they have a host. @@ -76,7 +76,20 @@ class NewsSource(object): """ cls_url = urllib.parse.urlparse(cls.url) url = urllib.parse.urlparse(url) - url = (url.scheme or cls_url.scheme, url.netloc or cls_url.netloc, url.path, '', '', '') + if keep_query_vars is None: + query = '' + else: + query_vars = {} + qs = urllib.parse.parse_qs(url.query) + for v in keep_query_vars: + if v in qs: + query_vars[v] = qs[v] + query_pairs = [] + for k, i in query_vars.items(): + for v in i: + query_pairs.append('{}={}'.format(k, v)) + query = '?'.join(query_pairs) + url = (url.scheme or cls_url.scheme, url.netloc or cls_url.netloc, url.path, '', query, '') return urllib.parse.urlunparse(url) @classmethod -- cgit v1.2.3 From 753b48246a8e3eb5bfffa77814ff297287951e03 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Wed, 27 Sep 2017 21:19:32 -0400 Subject: ABC news source, closes #7 --- unbiased/sources/abc.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 unbiased/sources/abc.py diff --git a/unbiased/sources/abc.py b/unbiased/sources/abc.py new file mode 100644 index 0000000..2ea7aff --- /dev/null +++ b/unbiased/sources/abc.py @@ -0,0 +1,43 @@ +from unbiased.sources.base import NewsSource + +class ABC(NewsSource): + + name = 'ABC News' + shortname = 'ABC' + url = 'http://abcnews.go.com/' + + @classmethod + def _fetch_urls(cls): + """ + Returns three tuples of urls, one for each of + the three tiers. + """ + soup = cls._fetch_content(cls.url) + + # get primary headline + h1 = soup.find('article', class_='hero')\ + .find('div', class_='caption-wrapper').h1.a['href'] + h1s = (h1,) + print(h1) + + # get secondary headlines + h2s = soup.find('div', id='row-2')\ + .find_all('article', class_='card single row-item') + h2s = tuple(x.find('div', class_='caption-wrapper').h1.a['href'] for x in h2s) + + # get tertiary headlines + h3s = soup.find('div', id='row-1')\ + .find('article', class_='headlines')\ + .find('div', id='tab-content')\ + .find_all('li', class_=['story', 'wirestory']) + h3s = tuple(x.div.h1.a['href'] for x in h3s) + + return h1s, h2s, h3s + + @classmethod + def _normalize_url(cls, url): + """ + ABC News urls include an 'id' query param that we need to + keep in order for the URL to work. 
+ """ + return NewsSource._normalize_url(url, ['id']) -- cgit v1.2.3 From beb04a9bb4935068926e167a38a3fdf9ec37c049 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Sat, 14 Oct 2017 17:21:20 -0400 Subject: Christian Science Monitor closes #6 --- unbiased/sources/csm.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 unbiased/sources/csm.py diff --git a/unbiased/sources/csm.py b/unbiased/sources/csm.py new file mode 100644 index 0000000..4e1eea5 --- /dev/null +++ b/unbiased/sources/csm.py @@ -0,0 +1,41 @@ +from unbiased.sources.base import NewsSource + +class CSM(NewsSource): + + name = 'Christian Science Monitor' + shortname = 'csm' + url = 'https://www.csmonitor.com/USA' + + bad_titles = ['Change Agent'] + bad_imgs = ['csm_logo'] + bad_urls = ['difference-maker'] + + @classmethod + def _fetch_urls(cls): + soup = cls._fetch_content(cls.url) + + # get primary headline + h1 = soup.find('div', id='block-0-0')\ + .find('h3', class_='story_headline')\ + .a['href'] + h1s = (h1,) + + # get secondary headlines + h2_blocks = soup.find_all('div', id=['block-1-0', 'block-0-1']) + h2s = [] + for block in h2_blocks: + hblocks = block.find_all('h3', class_='story_headline') + for hblock in hblocks: + h2s += [x for x in hblock.find_all('a') if 'first-look' not in x['href']] + h2s = tuple(x['href'] for x in h2s) + + # get tertiary headlines + h3_blocks = soup.find_all('div', id='block-0-2') + h3s = [] + for block in h3_blocks: + hblocks = block.find_all('h3', class_='story_headline') + for hblock in hblocks: + h3s += [x for x in hblock.find_all('a') if 'first-look' not in x['href']] + h3s = tuple(x['href'] for x in h3s) + + return h1s, h2s, h3s -- cgit v1.2.3 From ff01ea02a0cd85d7199455de1a053b57fdc27eee Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Sat, 14 Oct 2017 17:46:18 -0400 Subject: CBS News closes #5 --- unbiased/sources/cbs.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 unbiased/sources/cbs.py diff --git a/unbiased/sources/cbs.py b/unbiased/sources/cbs.py new file mode 100644 index 0000000..295e671 --- /dev/null +++ b/unbiased/sources/cbs.py @@ -0,0 +1,37 @@ +from unbiased.sources.base import NewsSource + +class CBS(NewsSource): + + name = 'CBS News' + shortname = 'cbs' + url = 'https://www.cbsnews.com/' + + bad_titles = ['60 Minutes'] + bad_descriptions = ['60 Minutes'] + bad_urls = ['whats-in-the-news-coverart'] + + @classmethod + def _fetch_urls(cls): + soup = cls._fetch_content(cls.url) + + # get primary headline + h1 = soup.find('h1', class_='title') + # sometimes they lead with a video + # if so, we'll pull the first h2 into the h1 slot later + if h1 is not None: + h1s = (h1.a['href'],) + + # get secondary headlines + h2s = soup.find('div', attrs={'data-tb-region': 'Big News Area Side Assets'})\ + .ul.find_all('li', attrs={'data-tb-region-item': True}) + h2s = tuple(x.a['href'] for x in h2s) + if h1 is None: + h1s = (h2s[0],) + h2s = tuple(h2s[1:]) + + # get tertiary headlines + h3s = soup.find('div', attrs={'data-tb-region': 'Hard News'})\ + .ul.find_all('li', attrs={'data-tb-region-item': True}) + h3s = tuple(x.a['href'] for x in h3s[:5]) + + return h1s, h2s, h3s -- cgit v1.2.3 From fde7eb18c21626739936ab5072d8e537bc3a16de Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Sat, 14 Oct 2017 18:44:06 -0400 Subject: NPR News closes #2 --- unbiased/sources/npr.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 unbiased/sources/npr.py diff --git 
a/unbiased/sources/npr.py b/unbiased/sources/npr.py new file mode 100644 index 0000000..e52459f --- /dev/null +++ b/unbiased/sources/npr.py @@ -0,0 +1,29 @@ +from unbiased.sources.base import NewsSource + +class NPR(NewsSource): + + name = 'NPR News' + shortname = 'npr' + url = 'http://www.npr.org/sections/news/' + + bad_titles = ['The Two-Way'] + bad_authors = ['Domenico Montanaro'] + + @classmethod + def _fetch_urls(cls): + soup = cls._fetch_content(cls.url) + + featured = soup.find('div', class_='featured-3-up')\ + .find_all('article', recursive=False) + + h1s = featured[:1] + h1s = tuple(x.find('h2', class_='title').a['href'] for x in h1s) + h2s = featured[1:] + h2s = tuple(x.find('h2', class_='title').a['href'] for x in h2s) + + # get tertiary headlines + h3s = soup.find('div', id='overflow')\ + .find_all('article', recursive=False) + h3s = tuple(x.find('h2', class_='title').a['href'] for x in h3s[:5]) + + return h1s, h2s, h3s -- cgit v1.2.3 From 4fa6bb4c64e90eb5c3c11074cf83747f01bd7fd7 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Sat, 14 Oct 2017 19:05:39 -0400 Subject: BBC News closes #3 --- unbiased/sources/bbc.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 unbiased/sources/bbc.py diff --git a/unbiased/sources/bbc.py b/unbiased/sources/bbc.py new file mode 100644 index 0000000..0dd0f80 --- /dev/null +++ b/unbiased/sources/bbc.py @@ -0,0 +1,26 @@ +from unbiased.sources.base import NewsSource + +class BBC(NewsSource): + + name = 'BBC News' + shortname = 'bbc' + url = 'http://www.bbc.com/news/world/us_and_canada' + + bad_images = ['bbc_news_logo.png'] + + @classmethod + def _fetch_urls(cls): + soup = cls._fetch_content(cls.url) + + h1s = soup.find('div', class_='buzzard-item')\ + .find('a', class_='title-link') + h1s = (h1s['href'],) + + h2s = soup.find_all('div', attrs={'class': 'pigeon__column', 'data-entityid': True}) + h2s = tuple(x.find('a', class_='title-link')['href'] for x in h2s) + + # get tertiary headlines + h3s = soup.find_all('div', attrs={'class': 'macaw-item', 'data-entityid': True}) + h3s = tuple(x.find('a', class_='title-link')['href'] for x in h3s) + + return h1s, h2s, h3s -- cgit v1.2.3 From 40842e8431a8c248c0d767c3b24a21d95bc136b4 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Sat, 14 Oct 2017 19:31:11 -0400 Subject: debug mode causes exceptions to raise immediately --- unbiased/main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/unbiased/main.py b/unbiased/main.py index 7b057ea..89071b1 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -68,7 +68,7 @@ def main(): while True: logger.info('Starting crawl') start = time.time() - run(args.webroot, args.sources) + run(args.webroot, args.sources, args.debug) finish = time.time() runtime = finish - start sleeptime = crawl_frequency - runtime @@ -78,12 +78,11 @@ def main(): if sleeptime > 0: time.sleep(sleeptime) -def run(webroot, source_names): +def run(webroot, source_names, debug_mode=False): logger.debug('Running with webroot="{}" for sources="{}"'.format(webroot, source_names)) sources = get_sources() - print(sources) if source_names is None: sources = sources.values() else: @@ -99,6 +98,8 @@ def run(webroot, source_names): built_sources.append(source.build()) break except Exception as ex: + if debug_mode is True: + raise tries += 1 if tries == 3: logger.error('Build failed. 
source={} ex={}'.format(source.name, ex)) -- cgit v1.2.3 From e2e504aac4b74d875da34b04f06b5409103d78e1 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Sat, 14 Oct 2017 19:33:21 -0400 Subject: log number of each type of article fetched --- unbiased/sources/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py index 14d867e..861d7f4 100644 --- a/unbiased/sources/base.py +++ b/unbiased/sources/base.py @@ -50,6 +50,7 @@ class NewsSource(object): @classmethod def build(cls): h1s, h2s, h3s = cls._fetch_urls() + logger.debug('Fetched {} h1s, {} h2s, {} h3s'.format(len(h1s), len(h2s), len(h3s))) h1s = tuple(cls._normalize_url(x) for x in h1s) h2s = tuple(cls._normalize_url(x) for x in h2s) h3s = tuple(cls._normalize_url(x) for x in h3s) -- cgit v1.2.3 From 04e19c02ba74cfe6629afbb84fc427b92ce18850 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Sat, 14 Oct 2017 19:35:35 -0400 Subject: code cleanup --- unbiased/sources/abc.py | 1 - unbiased/sources/base.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/unbiased/sources/abc.py b/unbiased/sources/abc.py index 2ea7aff..d9092a2 100644 --- a/unbiased/sources/abc.py +++ b/unbiased/sources/abc.py @@ -18,7 +18,6 @@ class ABC(NewsSource): h1 = soup.find('article', class_='hero')\ .find('div', class_='caption-wrapper').h1.a['href'] h1s = (h1,) - print(h1) # get secondary headlines h2s = soup.find('div', id='row-2')\ diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py index 861d7f4..9f51287 100644 --- a/unbiased/sources/base.py +++ b/unbiased/sources/base.py @@ -50,7 +50,7 @@ class NewsSource(object): @classmethod def build(cls): h1s, h2s, h3s = cls._fetch_urls() - logger.debug('Fetched {} h1s, {} h2s, {} h3s'.format(len(h1s), len(h2s), len(h3s))) + logger.info('Fetched {} h1s, {} h2s, {} h3s'.format(len(h1s), len(h2s), len(h3s))) h1s = tuple(cls._normalize_url(x) for x in h1s) h2s = tuple(cls._normalize_url(x) for x in h2s) h3s = tuple(cls._normalize_url(x) for x in h3s) -- cgit v1.2.3 From 4b5f6067f05b5e35555f8e55219808470f9d664f Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Sat, 14 Oct 2017 19:45:13 -0400 Subject: code cleanup --- unbiased/main.py | 2 +- unbiased/parser.py | 986 ------------------------------------------ unbiased/spotCheck.py | 41 -- unbiased/unbiasedFunctions.py | 241 ----------- unbiased/unbiasedObjects.py | 97 ----- unbiased/util.py | 113 +++++ 6 files changed, 114 insertions(+), 1366 deletions(-) delete mode 100755 unbiased/parser.py delete mode 100755 unbiased/spotCheck.py delete mode 100644 unbiased/unbiasedFunctions.py delete mode 100644 unbiased/unbiasedObjects.py create mode 100644 unbiased/util.py diff --git a/unbiased/main.py b/unbiased/main.py index 89071b1..19fd05b 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -6,7 +6,7 @@ import logging import logging.config import time -from unbiased.unbiasedFunctions import pickStories, pullImage, buildOutput, write_files, write_static_files +from unbiased.util import pickStories, pullImage, buildOutput, write_files, write_static_files from unbiased.sources import get_sources logger = logging.getLogger('unbiased') diff --git a/unbiased/parser.py b/unbiased/parser.py deleted file mode 100755 index 399e0f2..0000000 --- a/unbiased/parser.py +++ /dev/null @@ -1,986 +0,0 @@ -#!/usr/bin/env python3 - -import logging -import os -import re -import urllib.parse - -from bs4 import BeautifulSoup -import requests - -from unbiased.unbiasedObjects import * -from unbiased.unbiasedFunctions 
import buildArticle - -logger = logging.getLogger('unbiased') - - -''' -Takes in a URL, downloads the file to a temp file, -reads the file into a string, and returns that string -''' -def urlToContent(url, sourceEncoding='utf8'): - res = requests.get(url) - if res.status_code == 200: - return res.text - else: - raise Exception("Failed to download {}".format(url)) - - -''' -Creates a new newsSource2 object. For each URL in h1-h3URLs, -calls the file scraper and appends the new Article object. -Returns a newsSource2 object -''' -def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs): - - url_parts = urllib.parse.urlparse(url) - scheme = url_parts.scheme - h1URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h1URLs] - h2URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h2URLs] - h3URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h3URLs] - - h1Arr=[] - a=buildArticle(h1URLs[0], name) - if a==None: - logger.debug('H1 Nonetype in '+name) - else: - h1Arr.append(a) - - h2Arr=[] - for x in h2URLs: - a=buildArticle(x, name) - if a!=None: - h2Arr.append(a) - else: - logger.debug('H2 Nonetype in '+name) - - h3Arr=[] - for x in h3URLs: - a=buildArticle(x, name) - if a!=None: - h3Arr.append(a) - else: - logger.debug('H3 Nonetype in '+name) - - #BUILD THE NEWS SOURCE - newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr) - - return newsSource - - -''' -Some sites will replicate URLs across the page. This function removes them. -Check hierarchically: if h3 exists in h1s or h2s, remove from h3s; -if h2 exists in h1s, remove from h2s - -also check partial URLs (e.g. nytimes.com/story.html is the same as -nytimes.com/story.html?var=x -''' -def removeDuplicates(h1s, h2s, h3s): - #Assume h1s is one element, and keep it - - #remove h2 duplicates - removeArr=[] - for i in range(len(h2s)): - #check internally - for j in range(len(h2s)): - if i==j: - continue - else: - if h2s[i] in h2s[j]: - removeArr.append(h2s[j]) - #check against h1s - for k in range(len(h1s)): - if (h2s[i] in h1s[k]) or (h1s[k] in h2s[i]): - removeArr.append(h2s[i]) - for x in removeArr: - h2s.remove(x) - - #remove h3 duplicates - removeArr=[] - for i in range(len(h3s)): - #check internally - for j in range(len(h3s)): - if i==j: - continue - else: - if h3s[i] in h3s[j]: - removeArr.append(h3s[j]) - #check against h1s and h2s - h1and2=h1s+h2s - for k in range(len(h1and2)): - if (h3s[i] in h1and2[k]) or (h1and2[k] in h3s[i]): - removeArr.append(h3s[i]) - for x in removeArr: - if x in h3s: - h3s.remove(x) - - - return h1s, h2s, h3s - - - -def removalNotification(source, title, reason, value): - logger.debug("""Story removed - SOURCE:\t{} - TITLE:\t{}) - REASON:\t{} - VALUE:\t{}""".format(source, title, reason, value)) - - -def removeBadStoriesHelper(source, element, badStringList, article_tiers): - if badStringList is None: - return - for tier, articles in enumerate(article_tiers): - for idx, article in enumerate(articles): - if article is None: - logger.debug("None type found in removeBadStoriesHelper for {}".format(source.name)) - break - for item in badStringList: - if item in getattr(article, element): - article_tiers[tier].remove(article) - # if it's in the h1 slot, bump up the - # first h2 into the h1 slot - if tier == 0 and len(article_tiers[1]) > 0: - article_tiers[0].append(article_tiers[1][0]) - article_tiers[1].remove(article_tiers[1][0]) - removalNotification(source.name, article.title, element, item) - - -def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, 
badURLArr=None): - - arr=[source.h1Arr, source.h2Arr, source.h3Arr] - - removeBadStoriesHelper(source, "title", badTitleArr, arr) - removeBadStoriesHelper(source, "description", badDescArr, arr) - removeBadStoriesHelper(source, "author", badAuthorArr, arr) - removeBadStoriesHelper(source, "img", badImgArr, arr) - removeBadStoriesHelper(source, "url", badURLArr, arr) - - return source - [remainder of the deleted parser.py: the per-source string-splitting scrapers (buildTheHill, buildWeeklyStandard, buildNPR, buildFoxNews, buildNYT, and the other build* functions), followed by a commented-out HTML output template]
    - - -''' diff --git a/unbiased/spotCheck.py b/unbiased/spotCheck.py deleted file mode 100755 index 7ce50d3..0000000 --- a/unbiased/spotCheck.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 - -import sys - -from unbiased.parser import * -from unbiased.unbiasedObjects import * - -def spotCheck(src): - - fns = {'hil' : buildTheHill, - 'cbs' : buildCBS, - 'npr' : buildNPR, - 'fox' : buildFoxNews, - 'gdn' : buildGuardian, - 'blz' : buildBlaze, - 'bbc' : buildBBC, - 'nbc' : buildNBC, - 'wat' : buildWashTimes, - 'csm' : buildCSM, - 'abc' : buildABC} - - data=fns[src]() - - print('H1s:\n--------------') - for h in data.h1Arr: - print(h.title) - - print('\n\nH2s:\n--------------') - for h in data.h2Arr: - print(h.title) - - print('\n\nH3s:\n--------------') - for h in data.h3Arr: - print(h.title) - - print('\n\n') - - - -if __name__=='__main__': - spotCheck(sys.argv[1]) diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py deleted file mode 100644 index b07245c..0000000 --- a/unbiased/unbiasedFunctions.py +++ /dev/null @@ -1,241 +0,0 @@ -import html -import io -import logging -import os -import pkgutil -import random -import re -import shutil -import time -import urllib.parse - -from PIL import Image -import requests - -logger = logging.getLogger('unbiased') - -#take in a url and delimiters, return twitter card -def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd): - - debugging=False - if debugging: - logger.debug(sourceName) - logger.debug(url) - - url_parts = urllib.parse.urlparse(url) - scheme = url_parts.scheme - - #download url - try: - res = requests.get(url) - except Exception as ex: - logger.debug("""ARTICLE DOWNLOADING ERROR - SOURCE:\t{} - URL:\t{}""".format(sourceName, url)) - return None - - if res.status_code == 200: - content = res.text - else: - logger.debug("""ARTICLE DOWNLOADING ERROR - SOURCE:\t{} - URL:\t{}""".format(sourceName, url)) - return None - - try: - if sourceName=='The Guardian US': - #The Guardian puts an identifying banner on their og:images - #grab the main image from the page instead - - #scenario 1: regular image - if '')[0] - elif sourceName=='ABC News': - img='https://c1.staticflickr.com/7/6042/6276688407_12900948a2_b.jpgX' - if img[-1]=='/': - #because the quote separator could be ' or ", - #trim to just before it then lop it off - img=img[:-1].strip() - img=img[:-1] - # fix the scheme if it's missing - img = urllib.parse.urlparse(img, scheme=scheme).geturl() - - if debugging: - logger.debug(img) - - title=content.split('og:title" content=')[1][1:].split('>')[0] - if title[-1]=='/': - title=title[:-1].strip() - title=title[:-1] - - if debugging: - logger.debug(title) - - - author='' - if sourceName=='The Blaze': - if 'class="article-author">' in content: - author=content.split('class="article-author">')[1].split('<')[0] - elif 'class="article-author" href="' in content: - author=content.split('class="article-author" href="')[1] - author=author.split('>')[1].split('<')[0].strip() - else: - authorTags=['article:author', 'dc.creator', 'property="author'] - for tag in authorTags: - if tag in content: - author=content.split(tag+'" content=')[1][1:].split('>')[0] - author=author[:-1] - #trim an extra quotation mark for The Hill - if sourceName=='The Hill': - author=author.split('"', 1)[0] - break - - if debugging: - logger.debug(author) - - - if 'og:description' in content: - description=content.split('og:description" content=')[1][1:].split('>')[0] - if description[-1]=='/': - 
-                description=description[:-1].strip()
-                description=description[:-1]
-        else:
-            if sourceName=='The Hill':
-                description=content.split('div class="field-items"')[-1]
-                description=re.sub('<[^<]+?>', '', description)
-                description=description[1:200]
-            else:
-                logger.debug("SHOULDN'T GET HERE")
-
-        #strip out self-references
-        description=description.replace(sourceName+"'s", '***')
-        description=description.replace(sourceName+"'", '***')
-        description=description.replace(sourceName, '***')
-
-        if debugging:
-            logger.debug(description)
-
-
-        a=Article(html.unescape(title), url, img, html.unescape(description), sourceName, html.unescape(author))
-        return a
-
-    except Exception:
-        logger.debug("""ARTICLE PARSING ERROR
-        SOURCE:\t{}
-        URL:\t{}""".format(sourceName, url))
-        return None
-
-
-def pick_randoms(story_lists, length, per_source):
-    """
-    Return a randomly chosen list of 'length' stories, picking at
-    most 'per_source' stories from each source.
-    """
-    # TODO: weighting is incorrect if a source has fewer than 'per_source' articles
-    urandom = random.SystemRandom()
-    candidates = []
-    for stories in story_lists:
-        indexes = list(range(len(stories)))
-        urandom.shuffle(indexes)
-        random_indexes = indexes[:per_source]
-        candidates.extend([stories[x] for x in random_indexes])
-    indexes = list(range(len(candidates)))
-    urandom.shuffle(indexes)
-    random_indexes = indexes[:length]
-    return tuple(candidates[x] for x in random_indexes)
-
-
-def pickStories(newsSourceArr):
-    h1s = pick_randoms([x.h1s for x in newsSourceArr], 4, 1)
-    h2s = pick_randoms([x.h2s for x in newsSourceArr], 6, 2)
-    h3s = pick_randoms([x.h3s for x in newsSourceArr], 12, 2)
-    return h1s, h2s, h3s
-
-def buildOutput(top_stories, middle_stories, bottom_stories):
-    #read in the template html file
-    from jinja2 import Environment, PackageLoader, select_autoescape
-    env = Environment(
-        loader=PackageLoader('unbiased', 'html_template'),
-        autoescape=select_autoescape(['html', 'xml'])
-    )
-    template = env.get_template('unbiased.jinja.html')
-
-    timestamp = time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime())
-    utime = int(time.time())
-
-    sourcesStr = ', '.join(set([x.source for x in top_stories] + [x.source for x in middle_stories] + [x.source for x in bottom_stories]))
-
-    html = template.render(
-        timestamp = timestamp,
-        utime = utime,
-        top_stories = top_stories,
-        middle_stories = middle_stories,
-        bottom_stories = bottom_stories,
-        sources = sourcesStr,
-    )
-
-    #return updated text
-    return html
-
-def write_files(files_to_write, outDir):
-    for name, bytesio in files_to_write.items():
-        with open(os.path.join(outDir, name), 'wb') as fp:
-            shutil.copyfileobj(bytesio, fp)
-
-def write_static_files(outDir):
-    # copy over static package files
-    for filename in ['unbiased.css', 'favicon.ico', 'favicon.png', 'apple-touch-icon.png']:
-        data = pkgutil.get_data('unbiased', os.path.join('html_template', filename))
-        with open(os.path.join(outDir, filename), 'wb') as fp:
-            fp.write(data)
-
-def pullImage(url, index, webroot, target_width=350, target_height=200):
-    extension = url.split('.')[-1].split('?')[0]
-    img_name = 'img{}.{}'.format(index, extension)
-    res = requests.get(url)
-    if res.status_code == 200:
-        content = res.content
-    else:
-        logger.debug('Image not found: url={}'.format(url))
-        return ''
-    img = Image.open(io.BytesIO(content))
-    # crop to aspect ratio
-    target_ar = target_width / target_height
-    left, top, right, bottom = img.getbbox()
-    height = bottom - top
-    width = right - left
-    ar = width / height
-    if target_ar > ar:
-        new_height = (target_height / target_width) * width
-        bbox = (left, top + ((height - new_height) / 2), right, bottom - ((height - new_height) / 2))
-        img = img.crop(bbox)
-    elif target_ar < ar:
-        new_width = (target_width / target_height) * height
-        bbox = (left + ((width - new_width) / 2), top, right - ((width - new_width) / 2), bottom)
-        img = img.crop(bbox)
-    # resize if larger
-    if target_width * 2 < width or target_height * 2 < height:
-        img = img.resize((target_width*2, target_height*2), Image.LANCZOS)
-    # TODO: fill with a neutral color instead of just discarding alpha channel
-    img = img.convert('RGB')
-    # TODO: create retina images
-    jpg_name = 'img{}.jpg'.format(index)
-    jpg_file = io.BytesIO()
-    out_file = os.path.join(webroot, jpg_name)
-    img.save(jpg_file, 'JPEG')
-    jpg_file.seek(0)
-    return jpg_name, jpg_file
diff --git a/unbiased/unbiasedObjects.py b/unbiased/unbiasedObjects.py
deleted file mode 100644
index 9a8a78a..0000000
--- a/unbiased/unbiasedObjects.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import logging
-
-logger = logging.getLogger('unbiased')
-
-class Article():
-    title=''
-    url=''
-    img=''
-    description=''
-    source=''
-    author=''
-
-    def __init__(self, title, url, img, description, source, author):
-        self.title=title
-        self.url=url
-        self.img=img
-        self.description=description
-        self.source=source
-        self.author=author
-
-    def __str__(self):
-        return '-----------\ntitle: {}\nauthor: {}\nsource: {}\ndescription: {}\nurl: {}\nimg: {}\n-----------'.format(self.title, self.author, self.source, self.description, self.url, self.img)
-
-    def __repr__(self):
-        return '{}({}, {}, {})'.format(self.source.replace(' ', ''), self.title, self.author, self.url)
-
-
-class NewsSource2():
-    name=''
-    url=''
-    h1Arr=[]
-    h2Arr=[]
-    h3Arr=[]
-    def __init__(self, name, url, h1Arr, h2Arr, h3Arr):
-        self.name=name
-        self.url=url
-        self.h1Arr=h1Arr
-        self.h2Arr=h2Arr
-        self.h3Arr=h3Arr
-
-
-
-class NewsSource():
-    name=''
-    url=''
-    #multiple start values to step through file. end value default to '"'
-    h1SectionDividerStart=None
-    h1SectionDividerEnd=None
-    h1DelStart=[]
-    h1DelEnd='"'
-    h2SectionDividerStart=None
-    h2SectionDividerEnd=None
-    h2DelStart=[]
-    h2DelEnd='"'
-    h3SectionDividerStart=None
-    h3SectionDividerEnd=None
-    h3DelStart=[]
-    h3DelEnd='"'
-    #arrays of Article object types
-    h1Arr=None
-    h2Arr=None
-    h3Arr=None
-    #url to attach to stub links
-    stubURL=''
-
-    def __init__(self, name, url,
-                 h1DelStart, h2DelStart, h3DelStart,
-                 h1SectionDividerStart=None, h1SectionDividerEnd=None,
-                 h2SectionDividerStart=None, h2SectionDividerEnd=None,
-                 h3SectionDividerStart=None, h3SectionDividerEnd=None,
-                 stubURL=None):
-        self.name=name
-        self.url=url
-        self.h1DelStart=h1DelStart
-        self.h2DelStart=h2DelStart
-        self.h3DelStart=h3DelStart
-        self.h1SectionDividerStart=h1SectionDividerStart
-        self.h2SectionDividerStart=h2SectionDividerStart
-        self.h3SectionDividerStart=h3SectionDividerStart
-        self.h1SectionDividerEnd=h1SectionDividerEnd
-        self.h2SectionDividerEnd=h2SectionDividerEnd
-        self.h3SectionDividerEnd=h3SectionDividerEnd
-        self.h1Arr=[]
-        self.h2Arr=[]
-        self.h3Arr=[]
-        self.stubURL=stubURL
-
-    def addArticle(self, article, level):
-        if level==1:
-            self.h1Arr.append(article)
-        elif level==2:
-            self.h2Arr.append(article)
-        elif level==3:
-            self.h3Arr.append(article)
-        else:
-            logger.debug("Invalid level in NewsSource.addArtlce: " + level)
-
diff --git a/unbiased/util.py b/unbiased/util.py
new file mode 100644
index 0000000..12003b1
--- /dev/null
+++ b/unbiased/util.py
@@ -0,0 +1,113 @@
+import io
+import logging
+import os
+import pkgutil
+import random
+import shutil
+import time
+
+from PIL import Image
+import requests
+
+logger = logging.getLogger('unbiased')
+
+
+def pick_randoms(story_lists, length, per_source):
+    """
+    Return a randomly chosen list of 'length' stories, picking at
+    most 'per_source' stories from each source.
+ """ + # TODO: weighting is incorrect if a source has fewer than 'per_source' articles + urandom = random.SystemRandom() + candidates = [] + for stories in story_lists: + indexes = list(range(len(stories))) + urandom.shuffle(indexes) + random_indexes = indexes[:per_source] + candidates.extend([stories[x] for x in random_indexes]) + indexes = list(range(len(candidates))) + urandom.shuffle(indexes) + random_indexes = indexes[:length] + return tuple(candidates[x] for x in random_indexes) + + +def pickStories(newsSourceArr): + h1s = pick_randoms([x.h1s for x in newsSourceArr], 4, 1) + h2s = pick_randoms([x.h2s for x in newsSourceArr], 6, 2) + h3s = pick_randoms([x.h3s for x in newsSourceArr], 12, 2) + return h1s, h2s, h3s + + +def buildOutput(top_stories, middle_stories, bottom_stories): + # read in the template html file + from jinja2 import Environment, PackageLoader, select_autoescape + env = Environment( + loader=PackageLoader('unbiased', 'html_template'), + autoescape=select_autoescape(['html', 'xml']) + ) + template = env.get_template('unbiased.jinja.html') + + timestamp = time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) + utime = int(time.time()) + + sourcesStr = ', '.join(set([x.source for x in top_stories] + [x.source for x in middle_stories] + [x.source for x in bottom_stories])) + + html = template.render( + timestamp=timestamp, + utime=utime, + top_stories=top_stories, + middle_stories=middle_stories, + bottom_stories=bottom_stories, + sources=sourcesStr, + ) + + return html + + +def write_files(files_to_write, outDir): + for name, bytesio in files_to_write.items(): + with open(os.path.join(outDir, name), 'wb') as fp: + shutil.copyfileobj(bytesio, fp) + + +def write_static_files(outDir): + # copy over static package files + for filename in ['unbiased.css', 'favicon.ico', 'favicon.png', 'apple-touch-icon.png']: + data = pkgutil.get_data('unbiased', os.path.join('html_template', filename)) + with open(os.path.join(outDir, filename), 'wb') as fp: + fp.write(data) + + +def pullImage(url, index, webroot, target_width=350, target_height=200): + res = requests.get(url) + if res.status_code == 200: + content = res.content + else: + logger.debug('Image not found: url={}'.format(url)) + return '' + img = Image.open(io.BytesIO(content)) + # crop to aspect ratio + target_ar = target_width / target_height + left, top, right, bottom = img.getbbox() + height = bottom - top + width = right - left + ar = width / height + if target_ar > ar: + new_height = (target_height / target_width) * width + bbox = (left, top + ((height - new_height) / 2), right, bottom - ((height - new_height) / 2)) + img = img.crop(bbox) + elif target_ar < ar: + new_width = (target_width / target_height) * height + bbox = (left + ((width - new_width) / 2), top, right - ((width - new_width) / 2), bottom) + img = img.crop(bbox) + # resize if larger + if target_width * 2 < width or target_height * 2 < height: + img = img.resize((target_width * 2, target_height * 2), Image.LANCZOS) + # TODO: fill with a neutral color instead of just discarding alpha channel + img = img.convert('RGB') + # TODO: create retina images + jpg_name = 'img{}.jpg'.format(index) + jpg_file = io.BytesIO() + img.save(jpg_file, 'JPEG') + jpg_file.seek(0) + return jpg_name, jpg_file -- cgit v1.2.3