From d0c7c0541013cc9472b38ccfd614a314e9a86d70 Mon Sep 17 00:00:00 2001
From: Matt Singleton
Date: Sun, 10 Sep 2017 13:09:12 -0400
Subject: major refactor of news source building

---
 requirements.txt              |   2 +-
 unbiased/main.py              |  62 +++++-------
 unbiased/sources/__init__.py  |  10 ++
 unbiased/sources/base.py      | 222 ++++++++++++++++++++++++++++++++++++++++++
 unbiased/sources/fox.py       |  41 ++++++++
 unbiased/unbiasedFunctions.py |  87 +++++------
 6 files changed, 320 insertions(+), 104 deletions(-)
 create mode 100644 unbiased/sources/__init__.py
 create mode 100644 unbiased/sources/base.py
 create mode 100644 unbiased/sources/fox.py

diff --git a/requirements.txt b/requirements.txt
index 3767095..0d53cea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 beautifulsoup4~=4.6.0
 Jinja2~=2.9.6
-lxml=~=3.8.0
+lxml~=3.8.0
 Pillow~=4.2.1
 requests~=2.18.4
diff --git a/unbiased/main.py b/unbiased/main.py
index 4ccda24..949e646 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -5,9 +5,8 @@
 import logging
 import logging.config
 import time

-from unbiased.unbiasedObjects import *
-from unbiased.unbiasedFunctions import *
-from unbiased.parser import *
+from unbiased.unbiasedFunctions import pickStories, pullImage, buildOutput, writeOutputHTML
+from unbiased.sources import get_sources

 logger = logging.getLogger('unbiased')
@@ -52,6 +51,7 @@
     parser.add_argument('-l', '--log-dir', help='location to write detailed logs')
     parser.add_argument('-d', '--debug', action='store_true', help='run in debug mode')
     parser.add_argument('-o', '--oneshot', action='store_true', help='run once and exit')
+    parser.add_argument('-s', '--sources', type=lambda x: x.split(','), default=None, help='comma-separated list of source shortnames to crawl')
     args = parser.parse_args()

     if args.log_dir:
@@ -67,7 +67,7 @@
     while True:
         logger.info('Starting crawl')
         start = time.time()
-        run(args.webroot)
+        run(args.webroot, args.sources)
         finish = time.time()
         runtime = finish - start
         sleeptime = crawl_frequency - runtime
@@ -77,51 +77,33 @@
         if sleeptime > 0:
             time.sleep(sleeptime)

-def run(webroot):
-    sources = []
-
-    '''
-    SOURCES TO ADD NEXT:
-    -REUTERS
-    -Town Hall
-    '''
-
-    logger.debug('Running with webroot="{}"'.format(webroot))
-
-    ### These values have to be the second half of the function name
-    ### E.g. Guardian calls buildGuardian(), etc.
-    sourceFnArr = [
-        'Guardian',
-        'TheHill',
-        'NPR',
-        'BBC',
-        'NBC',
-        'CBS',
-        'FoxNews',
-        'WashTimes',
-        'CSM',
-        'ABC',
-    ]
-
-    for source in sourceFnArr:
-        logger.info('Crawling {}'.format(source))
+def run(webroot, source_names):
+
+    logger.debug('Running with webroot="{}" for sources="{}"'.format(webroot, source_names))
+
+    sources = get_sources()
+    logger.debug(sources)
+    if source_names is None:
+        sources = sources.values()
+    else:
+        sources = [sources[x] for x in source_names]
+
+    built_sources = []
+    for source in sources:
+        logger.info('Crawling {}'.format(source.name))
         tries = 0
         while tries < 3:
             time.sleep(tries)
             try:
-                fn = 'build' + source
-                possibles = globals().copy()
-                possibles.update(locals())
-                method = possibles.get(fn)
-                src = method()
-                sources.append(src)
+                built_sources.append(source.build())
                 break
             except Exception as ex:
                 tries += 1
                 if tries == 3:
-                    logger.error('Build failed. source={} ex={}'.format(source, ex))
+                    logger.error('Build failed. source={} ex={}'.format(source.name, ex))
                 else:
-                    logger.debug('Build failed, retrying. source={} ex={}'.format(source, ex))
+                    logger.debug('Build failed, retrying. source={} ex={}'.format(source.name, ex))
+    sources = tuple(built_sources)

     logger.info('Parsed home pages for: {}'.format([x.name for x in sources]))
     top_stories, middle_stories, bottom_stories = pickStories(sources)
diff --git a/unbiased/sources/__init__.py b/unbiased/sources/__init__.py
new file mode 100644
index 0000000..e4a473a
--- /dev/null
+++ b/unbiased/sources/__init__.py
@@ -0,0 +1,10 @@
+import importlib
+import pkgutil
+
+from unbiased.sources.base import NewsSource
+
+def get_sources():
+    for loader, name, is_pkg in pkgutil.walk_packages(__path__):
+        if name != 'base':
+            importlib.import_module('unbiased.sources.' + name)
+    return {x.shortname.lower(): x for x in NewsSource.__subclasses__()}
diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py
new file mode 100644
index 0000000..9dc14fd
--- /dev/null
+++ b/unbiased/sources/base.py
@@ -0,0 +1,222 @@
+import collections
+import html
+import logging
+import urllib
+
+from bs4 import BeautifulSoup
+import requests
+
+logger = logging.getLogger('unbiased')
+
+class Article(object):
+
+    def __init__(self, source, title, author, description, url, img):
+        self.source = source
+        self.title = title
+        self.author = author
+        self.description = description
+        self.url = url
+        self.img = img
+
+    def __repr__(self):
+        return 'Article({}, {}, {}, {}, {}, {})'.format(self.source, self.title, self.author, self.description, self.url, self.img)
+
+class NewsSource(object):
+    """
+    Abstract base class.
+    To implement:
+    - set 'name', 'shortname', and 'url'
+    - set 'bad_' variables to blacklist terms and phrases
+    - implement '_fetch_urls()', which should return three tuples
+      of urls, one for each tier
+    - override any of the '_get_*()' functions as necessary
+    """
+    # TODO: replace all string parsing with bs4
+
+    name = None
+    shortname = None
+    url = None
+
+    bad_titles = None
+    bad_authors = None
+    bad_descriptions = None
+    bad_imgs = None
+    bad_urls = None
+
+    def __init__(self, h1s, h2s, h3s):
+        self.h1s = h1s
+        self.h2s = h2s
+        self.h3s = h3s
+
+    @classmethod
+    def build(cls):
+        h1s, h2s, h3s = cls._fetch_urls()
+        h1s = tuple(cls._fix_url(x) for x in h1s)
+        h2s = tuple(cls._fix_url(x) for x in h2s)
+        h3s = tuple(cls._fix_url(x) for x in h3s)
+        h1s, h2s, h3s = cls._remove_duplicates(h1s, h2s, h3s)
+        h1s, h2s, h3s = cls._fetch_articles(h1s, h2s, h3s)
+        h1s, h2s, h3s = cls._remove_all_bad_stories(h1s, h2s, h3s)
+        return cls(h1s, h2s, h3s)
+
+    @classmethod
+    def _fetch_content(cls, url):
+        res = requests.get(url)
+        if res.status_code == 200:
+            content = res.text
+        else:
+            raise Exception("Failed to download {}".format(url))
+        return BeautifulSoup(content, 'lxml')
+
+    @classmethod
+    def _fix_url(cls, url, scheme='http'):
+        """
+        Make sure they have a scheme.
+        Trim any query parameters.
+        """
+        # TODO: proper URL parsing
+        if url.startswith('//'):
+            url = '{}:{}'.format(scheme, url)
+        url = url.split('?')[0]
+        return url
+
+    @classmethod
+    def _remove_duplicates(cls, h1s, h2s, h3s):
+        h2s = tuple(x for x in h2s if x not in h1s)
+        h3s = tuple(x for x in h3s if x not in h1s and x not in h2s)
+        return h1s, h2s, h3s
+
+    @classmethod
+    def _remove_bad_stories(cls, articles, element, filters):
+        if filters is None:
+            return articles
+        new_articles = []
+        for article in articles:
+            save = True
+            for f in filters:
+                if f in getattr(article, element):
+                    save = False
+                    break
+            if save:
+                new_articles.append(article)
+        return tuple(new_articles)
+
+    @classmethod
+    def _remove_all_bad_stories(cls, h1s, h2s, h3s):
+        new_articles = []
+        for articles in [h1s, h2s, h3s]:
+            articles = cls._remove_bad_stories(articles, 'title', cls.bad_titles)
+            articles = cls._remove_bad_stories(articles, 'description', cls.bad_descriptions)
+            articles = cls._remove_bad_stories(articles, 'author', cls.bad_authors)
+            articles = cls._remove_bad_stories(articles, 'img', cls.bad_imgs)
+            articles = cls._remove_bad_stories(articles, 'url', cls.bad_urls)
+            new_articles.append(list(articles))
+        if len(new_articles[0]) == 0 and len(new_articles[1]) > 0:
+            new_articles[0].append(new_articles[1].pop())
+        return tuple(tuple(x) for x in new_articles)
+
+    @classmethod
+    def _fetch_articles(cls, h1s, h2s, h3s):
+        ret = []
+        for urls in [h1s, h2s, h3s]:
+            articles = []
+            for url in urls:
+                article = cls._fetch_article(url)
+                if article is not None:
+                    articles.append(article)
+            ret.append(articles)
+        return tuple(tuple(x) for x in ret)
+
+    @classmethod
+    def _fetch_article(cls, url):
+        #soup = cls._fetch_content(url)
+
+        logger.debug(cls.name)
+        logger.debug(url)
+
+        url_parts = urllib.parse.urlparse(url)
+        scheme = url_parts.scheme
+
+        # download url
+        try:
+            res = requests.get(url)
+        except Exception as ex:
+            logger.debug("""ARTICLE DOWNLOADING ERROR
+                SOURCE:\t{}
+                URL:\t{}""".format(cls.name, url))
+            return None
+
+        if res.status_code == 200:
+            content = res.text
+        else:
+            logger.debug("""ARTICLE DOWNLOADING ERROR
+                SOURCE:\t{}
+                URL:\t{}""".format(cls.name, url))
+            return None
+
+        try:
+            img = cls._get_image(content)
+            img = urllib.parse.urlparse(img, scheme=scheme).geturl()
+            logger.debug(img)
+
+            title = cls._get_title(content)
+            logger.debug(title)
+
+            author = cls._get_author(content)
+            logger.debug(author)
+
+            description = cls._get_description(content)
+            logger.debug(description)
+            description = cls._remove_self_refs(description)
+            logger.debug(description)
+        except Exception:
+            logger.debug("""ARTICLE PARSING ERROR
+                SOURCE:\t{}
+                URL:\t{}""".format(cls.name, url))
+            return None
+
+        return Article(cls.name, title, author, description, url, img)
+
+    @classmethod
+    def _get_image(cls, content):
+        img = content.split('og:image" content=')[1][1:].split('>')[0]
+        if img[-1] == '/':
+            #because the quote separator could be ' or ",
+            #trim to just before it then lop it off
+            img = img[:-1].strip()
+            img = img[:-1]
+        return img
+
+    @classmethod
+    def _get_title(cls, content):
+        title = content.split('og:title" content=')[1][1:].split('>')[0]
+        if title[-1] == '/':
+            title = title[:-1].strip()
+            title = title[:-1]
+        return title
+
+    @classmethod
+    def _get_author(cls, content):
+        author = ''
+        authorTags = ['article:author', 'dc.creator', 'property="author']
+        for tag in authorTags:
+            if tag in content:
+                author = content.split(tag+'" content=')[1][1:].split('>')[0]
+                author = author[:-1]
+                break
+        return author
+
+    @classmethod
+    def _get_description(cls, content):
+        description = content.split('og:description" content=')[1][1:].split('>')[0]
+        if description[-1] == '/':
+            description = description[:-1].strip()
+            description = description[:-1]
+        return description
+
+    @classmethod
+    def _remove_self_refs(cls, description):
+        description = description.replace(cls.name+"'s", '***')
+        description = description.replace(cls.name+"'", '***')
+        description = description.replace(cls.name, '***')
+        return description
diff --git a/unbiased/sources/fox.py b/unbiased/sources/fox.py
new file mode 100644
index 0000000..ce7730f
--- /dev/null
+++ b/unbiased/sources/fox.py
@@ -0,0 +1,41 @@
+from unbiased.sources.base import NewsSource
+
+class Fox(NewsSource):
+
+    name = 'Fox News'
+    shortname = 'Fox'
+    url = 'http://www.foxnews.com'
+
+    bad_titles = ['O\'Reilly', 'Fox News', 'Bret Baier', 'Tucker']
+    bad_descriptions = ['Sean Hannity']
+    bad_authors = ['Bill O\'Reilly', 'Sean Hannity', 'Howard Kurtz']
+    bad_imgs = ['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
+    bad_urls = ['http://www.foxnews.com/opinion', 'videos.foxnews.com']
+
+    @classmethod
+    def _fetch_urls(cls):
+        """
+        Returns three tuples of urls, one for each of
+        the three tiers.
+        """
+        soup = cls._fetch_content(cls.url)
+
+        # get primary headline
+        h1 = soup.find('div', id='big-top')\
+                .find('div', class_='primary')\
+                .find('h1')\
+                .find('a')['href']
+        h1s = (h1,)
+
+        # get secondary headlines
+        h2s = soup.find('div', id='big-top').find('div', class_='top-stories').select('li > a')
+        h2s = tuple(x['href'] for x in h2s)
+
+        # get tertiary headlines
+        h3s = []
+        for ul in soup.find('section', id='latest').find_all('ul', recursive=False):
+            for li in ul.find_all('li', recursive=False):
+                h3s.append(li.find('a')['href'])
+        h3s = tuple(h3s)
+
+        return h1s, h2s, h3s
diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py
index 6ec89b7..7825d93 100644
--- a/unbiased/unbiasedFunctions.py
+++ b/unbiased/unbiasedFunctions.py
@@ -11,8 +11,6 @@ import urllib.parse
 from PIL import Image
 import requests

-from unbiased.unbiasedObjects import *
-
 logger = logging.getLogger('unbiased')

 #take in a url and delimiters, return twitter card
@@ -141,69 +139,30 @@ def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd,
         return None


-def pickStories(newsSourceArr):
-    # TODO: refactor to avoid infinite loops
-    #set the random order for sources
-    h1RandomSources=[]
-    guard = 0
-    while len(h1RandomSources)<4:
-        x=random.sample(range(len(newsSourceArr)), 1)[0]
-        if len(newsSourceArr[x].h1Arr)>0:
-            if x not in h1RandomSources:
-                h1RandomSources.append(x)
-        else:
-            logger.debug('No H1 stories in '+newsSourceArr[x].name)
-        guard += 1
-        if guard > 100:
-            return [], [], []
-
-    #For h2s and h3s, select N random sources (can repeat), then
-    #a non-repetitive random article from within
-    h2RandomPairs=[]
-    while len(h2RandomPairs) < 6:
-        x=random.sample(range(len(newsSourceArr)), 1)[0]
-        if len(newsSourceArr[x].h2Arr) > 0:
-            y=random.sample(range(len(newsSourceArr[x].h2Arr)), 1)[0]
-            pair=[x,y]
-            if not pair in h2RandomPairs:
-                h2RandomPairs.append(pair)
-        else:
-            logger.debug('No H2 stories in '+newsSourceArr[x].name)
-
-    h3RandomPairs=[]
-    while len(h3RandomPairs) < 12:
-        x=random.sample(range(len(newsSourceArr)), 1)[0]
-        if len(newsSourceArr[x].h3Arr) > 0:
-            y=random.sample(range(len(newsSourceArr[x].h3Arr)), 1)[0]
-            pair=[x,y]
-            if not pair in h3RandomPairs:
-                h3RandomPairs.append(pair)
-        else:
-            logger.debug('No H3 stories in '+newsSourceArr[x].name)
+def pick_randoms(story_lists, length, per_source):
+    """
+    Return a randomly chosen list of 'length' stories, picking at
+    most 'per_source' stories from each source.
+    """
+    # TODO: weighting is incorrect if a source has fewer than 'per_source' articles
+    urandom = random.SystemRandom()
+    candidates = []
+    for stories in story_lists:
+        indexes = list(range(len(stories)))
+        urandom.shuffle(indexes)
+        random_indexes = indexes[:per_source]
+        candidates.extend([stories[x] for x in random_indexes])
+    indexes = list(range(len(candidates)))
+    urandom.shuffle(indexes)
+    random_indexes = indexes[:length]
+    return tuple(candidates[x] for x in random_indexes)

-    # collect articles for each section
-    image_index = 0
-    top_stories = []
-    for i in range(len(h1RandomSources)):
-        source=newsSourceArr[h1RandomSources[i]]
-        randomArticle=random.sample(range(len(source.h1Arr)), 1)[0]
-        article=source.h1Arr[randomArticle]
-        top_stories.append(article)
-
-    middle_stories = []
-    for i in range(len(h2RandomPairs)):
-        pair=h2RandomPairs[i]
-        article=newsSourceArr[pair[0]].h2Arr[pair[1]]
-        middle_stories.append(article)
-
-    bottom_stories = []
-    for i in range(len(h3RandomPairs)):
-        pair=h3RandomPairs[i]
-        article=newsSourceArr[pair[0]].h3Arr[pair[1]]
-        bottom_stories.append(article)
-
-    return top_stories, middle_stories, bottom_stories
+def pickStories(newsSourceArr):
+    h1s = pick_randoms([x.h1s for x in newsSourceArr], 4, 1)
+    h2s = pick_randoms([x.h2s for x in newsSourceArr], 6, 2)
+    h3s = pick_randoms([x.h3s for x in newsSourceArr], 12, 2)
+    return h1s, h2s, h3s


 def buildOutput(top_stories, middle_stories, bottom_stories):
     #read in the template html file
@@ -270,6 +229,8 @@ def pullImage(url, index, webroot, target_width=350, target_height=200):
     # resize if larger
     if target_width * 2 < width or target_height * 2 < height:
        img = img.resize((target_width*2, target_height*2), Image.LANCZOS)
+    # TODO: fill with a neutral color instead of just discarding alpha channel
+    img = img.convert('RGB')
     # TODO: create retina images
     jpg_name = 'img{}.jpg'.format(index)
     out_file = os.path.join(webroot, jpg_name)
--
cgit v1.2.3
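
As a companion illustration (not part of the patch): a minimal sketch of what another source module could look like under the NewsSource contract introduced in unbiased/sources/base.py. The site, class name, and selectors below are hypothetical; only the structure -- the name/shortname/url attributes, the bad_* blacklists, and a _fetch_urls() that returns three tuples of urls -- mirrors the base class and the Fox implementation above.

    # Hypothetical example -- unbiased/sources/example.py
    # (the site, class name, and CSS selectors are made up for illustration)
    from unbiased.sources.base import NewsSource

    class Example(NewsSource):

        name = 'Example News'
        shortname = 'Example'
        url = 'http://news.example.com'

        # stories matching these terms are dropped by _remove_all_bad_stories()
        bad_titles = ['Opinion']
        bad_urls = ['news.example.com/opinion']

        @classmethod
        def _fetch_urls(cls):
            """
            Return three tuples of story urls, one per tier.
            """
            soup = cls._fetch_content(cls.url)

            # single lead headline
            h1s = (soup.find('div', class_='lead-story').find('a')['href'],)

            # secondary headlines
            h2s = tuple(a['href'] for a in soup.select('div.top-stories li > a'))

            # everything else on the front page
            h3s = tuple(a['href'] for a in soup.select('ul.latest-news li > a'))

            return h1s, h2s, h3s

Because such a module would live in unbiased/sources/ and subclass NewsSource, get_sources() in unbiased/sources/__init__.py would discover it automatically via pkgutil.walk_packages(), and it would become selectable from main.py as --sources example (its shortname, lowercased).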