author     Matt Singleton <matt@xcolour.net>   2017-09-10 13:09:12 -0400
committer  Matt Singleton <matt@xcolour.net>   2017-09-10 13:09:12 -0400
commit     d0c7c0541013cc9472b38ccfd614a314e9a86d70 (patch)
tree       761aa2e7cec0e9d7ce25b516aef5875b962fc032
parent     38a64b344bc6a25ce0faf17ddb7ed3439d0d007d (diff)
major refactor of news source building
-rw-r--r--   requirements.txt              |   2
-rwxr-xr-x   unbiased/main.py              |  62
-rw-r--r--   unbiased/sources/__init__.py  |  10
-rw-r--r--   unbiased/sources/base.py      | 222
-rw-r--r--   unbiased/sources/fox.py       |  41
-rw-r--r--   unbiased/unbiasedFunctions.py |  87
6 files changed, 320 insertions, 104 deletions
diff --git a/requirements.txt b/requirements.txt
index 3767095..0d53cea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
beautifulsoup4~=4.6.0
Jinja2~=2.9.6
-lxml=~=3.8.0
+lxml~=3.8.0
Pillow~=4.2.1
requests~=2.18.4
diff --git a/unbiased/main.py b/unbiased/main.py
index 4ccda24..949e646 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -5,9 +5,8 @@ import logging
import logging.config
import time
-from unbiased.unbiasedObjects import *
-from unbiased.unbiasedFunctions import *
-from unbiased.parser import *
+from unbiased.unbiasedFunctions import pickStories, pullImage, buildOutput, writeOutputHTML
+from unbiased.sources import get_sources
logger = logging.getLogger('unbiased')
@@ -52,6 +51,7 @@ def main():
parser.add_argument('-l', '--log-dir', help='location to write detailed logs')
parser.add_argument('-d', '--debug', action='store_true', help='run in debug mode')
parser.add_argument('-o', '--oneshot', action='store_true', help='run once and exit')
+ parser.add_argument('-s', '--sources', type=lambda x: x.split(','), default=None)
args = parser.parse_args()
if args.log_dir:
@@ -67,7 +67,7 @@ def main():
while True:
logger.info('Starting crawl')
start = time.time()
- run(args.webroot)
+ run(args.webroot, args.sources)
finish = time.time()
runtime = finish - start
sleeptime = crawl_frequency - runtime
@@ -77,51 +77,33 @@ def main():
if sleeptime > 0:
time.sleep(sleeptime)
-def run(webroot):
- sources = []
-
- '''
- SOURCES TO ADD NEXT:
- -REUTERS
- -Town Hall
- '''
-
- logger.debug('Running with webroot="{}"'.format(webroot))
-
- ### These values have to be the second half of the function name
- ### E.g. Guardian calls buildGuardian(), etc.
- sourceFnArr = [
- 'Guardian',
- 'TheHill',
- 'NPR',
- 'BBC',
- 'NBC',
- 'CBS',
- 'FoxNews',
- 'WashTimes',
- 'CSM',
- 'ABC',
- ]
-
- for source in sourceFnArr:
- logger.info('Crawling {}'.format(source))
+def run(webroot, source_names):
+
+ logger.debug('Running with webroot="{}" for sources="{}"'.format(webroot, source_names))
+
+ sources = get_sources()
+ logger.debug(sources)
+ if source_names is None:
+ sources = sources.values()
+ else:
+ sources = [sources[x] for x in source_names]
+
+ built_sources = []
+ for source in sources:
+ logger.info('Crawling {}'.format(source.name))
tries = 0
while tries < 3:
time.sleep(tries)
try:
- fn = 'build' + source
- possibles = globals().copy()
- possibles.update(locals())
- method = possibles.get(fn)
- src = method()
- sources.append(src)
+ built_sources.append(source.build())
break
except Exception as ex:
tries += 1
if tries == 3:
- logger.error('Build failed. source={} ex={}'.format(source, ex))
+ logger.error('Build failed. source={} ex={}'.format(source.name, ex))
else:
- logger.debug('Build failed, retrying. source={} ex={}'.format(source, ex))
+ logger.debug('Build failed, retrying. source={} ex={}'.format(source.name, ex))
+ sources = tuple(built_sources)
logger.info('Parsed home pages for: {}'.format([x.name for x in sources]))
top_stories, middle_stories, bottom_stories = pickStories(sources)
diff --git a/unbiased/sources/__init__.py b/unbiased/sources/__init__.py
new file mode 100644
index 0000000..e4a473a
--- /dev/null
+++ b/unbiased/sources/__init__.py
@@ -0,0 +1,10 @@
+import importlib
+import pkgutil
+
+from unbiased.sources.base import NewsSource
+
+def get_sources():
+ for loader, name, is_pkg in pkgutil.walk_packages(__path__):
+ if name != 'base':
+ importlib.import_module('unbiased.sources.' + name)
+ return {x.shortname.lower(): x for x in NewsSource.__subclasses__()}
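
Note: the registry above imports every module in unbiased/sources/ except base and keys each NewsSource subclass by its lowercased shortname. A rough sketch of how main.run() consumes it (only the Fox source added later in this commit exists in this tree so far):

from unbiased.sources import get_sources

sources = get_sources()
# e.g. {'fox': <class 'unbiased.sources.fox.Fox'>}
wanted = ['fox']                              # what the new --sources flag yields
selected = [sources[name] for name in wanted] # or sources.values() when no flag is given
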
diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py
new file mode 100644
index 0000000..9dc14fd
--- /dev/null
+++ b/unbiased/sources/base.py
@@ -0,0 +1,222 @@
+import collections
+import html
+import logging
+import urllib
+
+from bs4 import BeautifulSoup
+import requests
+
+logger = logging.getLogger('unbiased')
+
+class Article(object):
+
+ def __init__(self, source, title, author, description, url, img):
+ self.source = source
+ self.title = title
+ self.author = author
+ self.description = description
+ self.url = url
+ self.img = img
+
+ def __repr__(self):
+ return 'Article({}, {}, {}, {}, {}, {})'.format(self.source, self.title, self.author, self.description, self.url, self.img)
+
+class NewsSource(object):
+ """
+ Abstract base class.
+ To implement:
+ - set 'name', 'shortname', and 'url'
+ - set 'bad_' variables to blacklist terms and phrases
+ - implement '_fetch_urls()', which should return three tuples
+ of urls, one for each tier
+ - override any of the '_get_*()' functions as necessary
+ """
+ # TODO: replace all string parsing with bs4
+
+ name = None
+ shortname = None
+ url = None
+
+ bad_titles = None
+ bad_authors = None
+ bad_descriptions = None
+ bad_imgs = None
+ bad_urls = None
+
+ def __init__(self, h1s, h2s, h3s):
+ self.h1s = h1s
+ self.h2s = h2s
+ self.h3s = h3s
+
+ @classmethod
+ def build(cls):
+ h1s, h2s, h3s = cls._fetch_urls()
+ h1s = tuple(cls._fix_url(x) for x in h1s)
+ h2s = tuple(cls._fix_url(x) for x in h2s)
+ h3s = tuple(cls._fix_url(x) for x in h3s)
+ h1s, h2s, h3s = cls._remove_duplicates(h1s, h2s, h3s)
+ h1s, h2s, h3s = cls._fetch_articles(h1s, h2s, h3s)
+ h1s, h2s, h3s = cls._remove_all_bad_stories(h1s, h2s, h3s)
+ return cls(h1s, h2s, h3s)
+
+ @classmethod
+ def _fetch_content(cls, url):
+ res = requests.get(url)
+ if res.status_code == 200:
+ content = res.text
+ else:
+ raise Exception("Failed to download {}".format(url))
+ return BeautifulSoup(content, 'lxml')
+
+ @classmethod
+ def _fix_url(cls, url, scheme='http'):
+ """
+ Make sure they have a scheme.
+ Trim any query parameters.
+ """
+ # TODO: proper URL parsing
+ if url.startswith('//'):
+ url = '{}:{}'.format(scheme, url)
+ url = url.split('?')[0]
+ return url
+
+ @classmethod
+ def _remove_duplicates(cls, h1s, h2s, h3s):
+ h2s = tuple(x for x in h2s if x not in h1s)
+ h3s = tuple(x for x in h3s if x not in h1s and x not in h2s)
+ return h1s, h2s, h3s
+
+ @classmethod
+ def _remove_bad_stories(cls, articles, element, filters):
+ if filters is None:
+ return articles
+ new_articles = []
+ for article in articles:
+ save = True
+ for f in filters:
+ if f in getattr(article, element):
+ save = False
+ break
+ if save:
+ new_articles.append(article)
+ return tuple(new_articles)
+
+ @classmethod
+ def _remove_all_bad_stories(cls, h1s, h2s, h3s):
+ new_articles = []
+ for articles in [h1s, h2s, h3s]:
+ articles = cls._remove_bad_stories(articles, 'title', cls.bad_titles)
+ articles = cls._remove_bad_stories(articles, 'description', cls.bad_descriptions)
+ articles = cls._remove_bad_stories(articles, 'author', cls.bad_authors)
+ articles = cls._remove_bad_stories(articles, 'img', cls.bad_imgs)
+ articles = cls._remove_bad_stories(articles, 'url', cls.bad_urls)
+ new_articles.append(list(articles))  # list, so tier 1 can borrow a story from tier 2 below
+ if len(new_articles[0]) == 0 and len(new_articles[1]) > 0:
+ new_articles[0].append(new_articles[1].pop())
+ return tuple(tuple(x) for x in new_articles)
+
+ @classmethod
+ def _fetch_articles(cls, h1s, h2s, h3s):
+ ret = []
+ for urls in [h1s, h2s, h3s]:
+ articles = []
+ for url in urls:
+ article = cls._fetch_article(url)
+ if article is not None:
+ articles.append(article)
+ ret.append(articles)
+ return tuple(tuple(x) for x in ret)
+
+ @classmethod
+ def _fetch_article(cls, url):
+ #soup = cls._fetch_content(url)
+
+ logger.debug(cls.name)
+ logger.debug(url)
+
+ url_parts = urllib.parse.urlparse(url)
+ scheme = url_parts.scheme
+
+ # download url
+ try:
+ res = requests.get(url)
+ except Exception as ex:
+ logger.debug("""ARTICLE DOWNLOADING ERROR
+ SOURCE:\t{}
+ URL:\t{}""".format(cls.name, url))
+ return None
+
+ if res.status_code == 200:
+ content = res.text
+ else:
+ logger.debug("""ARTICLE DOWNLOADING ERROR
+ SOURCE:\t{}
+ URL:\t{}""".format(cls.name, url))
+ return None
+
+ try:
+ img = cls._get_image(content)
+ img = urllib.parse.urlparse(img, scheme=scheme).geturl()
+ logger.debug(img)
+
+ title = cls._get_title(content)
+ logger.debug(title)
+
+ author = cls._get_author(content)
+ logger.debug(author)
+
+ description = cls._get_description(content)
+ logger.debug(description)
+ description = cls._remove_self_refs(description)
+ logger.debug(description)
+ except Exception:
+ logger.debug("""ARTICLE PARSING ERROR
+ SOURCE:\t{}
+ URL:\t{}""".format(cls.name, url))
+ return None
+
+ return Article(cls.name, title, author, description, url, img)
+
+ @classmethod
+ def _get_image(cls, content):
+ img = content.split('og:image" content=')[1][1:].split('>')[0]
+ if img[-1] == '/':
+ #because the quote separator could be ' or ",
+ #trim to just before it then lop it off
+ img = img[:-1].strip()
+ img = img[:-1]
+ return img
+
+ @classmethod
+ def _get_title(cls, content):
+ title = content.split('og:title" content=')[1][1:].split('>')[0]
+ if title[-1] == '/':
+ title = title[:-1].strip()
+ title = title[:-1]
+ return title
+
+ @classmethod
+ def _get_author(cls, content):
+ author = ''
+ authorTags = ['article:author', 'dc.creator', 'property="author']
+ for tag in authorTags:
+ if tag in content:
+ author = content.split(tag+'" content=')[1][1:].split('>')[0]
+ author = author[:-1]
+ break
+ return author
+
+ @classmethod
+ def _get_description(cls, content):
+ description = content.split('og:description" content=')[1][1:].split('>')[0]
+ if description[-1] == '/':
+ description = description[:-1].strip()
+ description = description[:-1]
+ return description
+
+ @classmethod
+ def _remove_self_refs(cls, description):
+ description = description.replace(cls.name+"'s", '***')
+ description = description.replace(cls.name+"'", '***')
+ description = description.replace(cls.name, '***')
+ return description
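
Note: per the "To implement" notes in the NewsSource docstring, a subclass would look roughly like the sketch below. The Example class, CSS selectors, and filter terms are invented for illustration and are not part of this commit; build() then downloads each URL, parses the og: meta tags into Article objects, and drops anything matching the bad_* lists.

from unbiased.sources.base import NewsSource

class Example(NewsSource):
    # hypothetical source, shown only to illustrate the base class contract
    name = 'Example News'
    shortname = 'Example'
    url = 'http://www.example.com'

    bad_titles = ['Opinion']
    bad_urls = ['example.com/video']

    @classmethod
    def _fetch_urls(cls):
        # return three tuples of article URLs, one per headline tier
        soup = cls._fetch_content(cls.url)
        h1s = tuple(a['href'] for a in soup.select('.lead-story a')[:1])
        h2s = tuple(a['href'] for a in soup.select('.top-stories a'))
        h3s = tuple(a['href'] for a in soup.select('.latest a'))
        return h1s, h2s, h3s
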
diff --git a/unbiased/sources/fox.py b/unbiased/sources/fox.py
new file mode 100644
index 0000000..ce7730f
--- /dev/null
+++ b/unbiased/sources/fox.py
@@ -0,0 +1,41 @@
+from unbiased.sources.base import NewsSource
+
+class Fox(NewsSource):
+
+ name = 'Fox News'
+ shortname = 'Fox'
+ url = 'http://www.foxnews.com'
+
+ bad_titles = ['O&#039;Reilly', 'Fox News', 'Brett Baier', 'Tucker']
+ bad_descriptions = ['Sean Hannity']
+ bad_authors = ['Bill O\'Reilly', 'Sean Hannity', 'Howard Kurtz']
+ bad_imgs = ['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
+ bad_urls = ['http://www.foxnews.com/opinion', 'videos.foxnews.com']
+
+ @classmethod
+ def _fetch_urls(cls):
+ """
+ Returns three tuples of urls, one for each of
+ the three tiers.
+ """
+ soup = cls._fetch_content(cls.url)
+
+ # get primary headline
+ h1 = soup.find('div', id='big-top')\
+ .find('div', class_='primary')\
+ .find('h1')\
+ .find('a')['href']
+ h1s = (h1,)
+
+ # get secondary headlines
+ h2s = soup.find('div', id='big-top').find('div', class_='top-stories').select('li > a')
+ h2s = tuple(x['href'] for x in h2s)
+
+ # get tertiary headlines
+ h3s = []
+ for ul in soup.find('section', id='latest').find_all('ul', recursive=False):
+ for li in ul.find_all('li', recursive=False):
+ h3s.append(li.find('a')['href'])
+ h3s = tuple(h3s)
+
+ return h1s, h2s, h3s
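
Note: as a quick smoke test of the new source (assuming foxnews.com is reachable and its markup still matches these selectors), the class can be exercised on its own:

from unbiased.sources.fox import Fox

fox = Fox.build()
for article in fox.h1s + fox.h2s:
    print(article.title, article.url)
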
diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py
index 6ec89b7..7825d93 100644
--- a/unbiased/unbiasedFunctions.py
+++ b/unbiased/unbiasedFunctions.py
@@ -11,8 +11,6 @@ import urllib.parse
from PIL import Image
import requests
-from unbiased.unbiasedObjects import *
-
logger = logging.getLogger('unbiased')
#take in a url and delimiters, return twitter card
@@ -141,69 +139,30 @@ def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd,
return None
-def pickStories(newsSourceArr):
- # TODO: refactor to avoid infinite loops
- #set the random order for sources
- h1RandomSources=[]
- guard = 0
- while len(h1RandomSources)<4:
- x=random.sample(range(len(newsSourceArr)), 1)[0]
- if len(newsSourceArr[x].h1Arr)>0:
- if x not in h1RandomSources:
- h1RandomSources.append(x)
- else:
- logger.debug('No H1 stories in '+newsSourceArr[x].name)
- guard += 1
- if guard > 100:
- return [], [], []
-
- #For h2s and h3s, select N random sources (can repeat), then
- #a non-repetitive random article from within
- h2RandomPairs=[]
- while len(h2RandomPairs) < 6:
- x=random.sample(range(len(newsSourceArr)), 1)[0]
- if len(newsSourceArr[x].h2Arr) > 0:
- y=random.sample(range(len(newsSourceArr[x].h2Arr)), 1)[0]
- pair=[x,y]
- if not pair in h2RandomPairs:
- h2RandomPairs.append(pair)
- else:
- logger.debug('No H2 stories in '+newsSourceArr[x].name)
-
- h3RandomPairs=[]
- while len(h3RandomPairs) < 12:
- x=random.sample(range(len(newsSourceArr)), 1)[0]
- if len(newsSourceArr[x].h3Arr) > 0:
- y=random.sample(range(len(newsSourceArr[x].h3Arr)), 1)[0]
- pair=[x,y]
- if not pair in h3RandomPairs:
- h3RandomPairs.append(pair)
- else:
- logger.debug('No H3 stories in '+newsSourceArr[x].name)
+def pick_randoms(story_lists, length, per_source):
+ """
+ Return a randomly chosen list of 'length' stories, picking at
+ most 'per_source' stories from each source.
+ """
+ # TODO: weighting is incorrect if a source has fewer than 'per_source' articles
+ urandom = random.SystemRandom()
+ candidates = []
+ for stories in story_lists:
+ indexes = list(range(len(stories)))
+ urandom.shuffle(indexes)
+ random_indexes = indexes[:per_source]
+ candidates.extend([stories[x] for x in random_indexes])
+ indexes = list(range(len(candidates)))
+ urandom.shuffle(indexes)
+ random_indexes = indexes[:length]
+ return tuple(candidates[x] for x in random_indexes)
- # collect articles for each section
- image_index = 0
- top_stories = []
- for i in range(len(h1RandomSources)):
- source=newsSourceArr[h1RandomSources[i]]
- randomArticle=random.sample(range(len(source.h1Arr)), 1)[0]
- article=source.h1Arr[randomArticle]
- top_stories.append(article)
-
- middle_stories = []
- for i in range(len(h2RandomPairs)):
- pair=h2RandomPairs[i]
- article=newsSourceArr[pair[0]].h2Arr[pair[1]]
- middle_stories.append(article)
-
- bottom_stories = []
- for i in range(len(h3RandomPairs)):
- pair=h3RandomPairs[i]
- article=newsSourceArr[pair[0]].h3Arr[pair[1]]
- bottom_stories.append(article)
-
- return top_stories, middle_stories, bottom_stories
+def pickStories(newsSourceArr):
+ h1s = pick_randoms([x.h1s for x in newsSourceArr], 4, 1)
+ h2s = pick_randoms([x.h2s for x in newsSourceArr], 6, 2)
+ h3s = pick_randoms([x.h3s for x in newsSourceArr], 12, 2)
+ return h1s, h2s, h3s
def buildOutput(top_stories, middle_stories, bottom_stories):
#read in the template html file
@@ -270,6 +229,8 @@ def pullImage(url, index, webroot, target_width=350, target_height=200):
# resize if larger
if target_width * 2 < width or target_height * 2 < height:
img = img.resize((target_width*2, target_height*2), Image.LANCZOS)
+ # TODO: fill with a neutral color instead of just discarding alpha channel
+ img = img.convert('RGB')
# TODO: create retina images
jpg_name = 'img{}.jpg'.format(index)
out_file = os.path.join(webroot, jpg_name)
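
Note: to make the new selection logic concrete, this is roughly how pick_randoms behaves with plain lists (output order is random; the counts are what matter):

from unbiased.unbiasedFunctions import pick_randoms

story_lists = [
    ['a1', 'a2', 'a3'],   # source A
    ['b1', 'b2'],         # source B
    ['c1'],               # source C
]
# take at most 2 candidates per source, then 4 of those at random
picked = pick_randoms(story_lists, length=4, per_source=2)
assert len(picked) == 4
assert sum(s.startswith('a') for s in picked) <= 2
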