From 8dffc67fae2c5a6cc1fe125809e0b74d8b4b28f3 Mon Sep 17 00:00:00 2001
From: Matt Singleton
Date: Wed, 19 Apr 2017 16:47:30 -0400
Subject: don't need a scratch directory any more

---
 unbiased/main.py              | 10 ++---
 unbiased/parser.py            | 91 +++++++++++++++++++++----------------------
 unbiased/unbiasedFunctions.py | 11 +++---
 3 files changed, 54 insertions(+), 58 deletions(-)

diff --git a/unbiased/main.py b/unbiased/main.py
index c760788..60211ea 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -19,14 +19,13 @@ logger.addHandler(ch)
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('-w', '--webroot', default='/var/www/ubiased', help='location to write the output html')
-    parser.add_argument('-s', '--scratch', default='/opt/unbiased/scratch', help='writable scratch workspace')
     args = parser.parse_args()
 
     crawl_frequency = 600
     while True:
         logger.info('Starting crawl')
         start = time.time()
-        run(args.webroot, args.scratch)
+        run(args.webroot)
         finish = time.time()
         runtime = finish - start
         sleeptime = crawl_frequency - runtime
@@ -34,7 +33,7 @@ def main():
         if sleeptime > 0:
             time.sleep(sleeptime)
 
-def run(webroot, scratch):
+def run(webroot):
     sourceList=[]
 
     '''
@@ -47,7 +46,6 @@ def run(webroot, scratch):
     '''
 
     logger.debug('Running with webroot="{}"'.format(webroot))
-    logger.debug('Running with scratch="{}"'.format(scratch))
 
 
     ### These values have to be the second half of the function name
@@ -65,7 +63,7 @@ def run(webroot, scratch):
             possibles = globals().copy()
             possibles.update(locals())
             method = possibles.get(fn)
-            src=method(scratch)
+            src=method()
             sourceList.append(src)
             break
         except Exception as ex:
@@ -79,7 +77,7 @@ def run(webroot, scratch):
     newsSourceArr = sourceList
 
     #build the output file HTML
-    outputHTML=buildOutput(newsSourceArr, webroot, scratch)
+    outputHTML=buildOutput(newsSourceArr, webroot)
 
     #print the output file HTML
     printOutputHTML(outputHTML, webroot)
diff --git a/unbiased/parser.py b/unbiased/parser.py
index 0a8398c..41727f5 100755
--- a/unbiased/parser.py
+++ b/unbiased/parser.py
@@ -3,7 +3,6 @@
 import logging
 import os
 import re
-import subprocess
 import urllib.parse
 
 import requests
@@ -18,7 +17,7 @@ logger = logging.getLogger('unbiased')
 Takes in a URL, downloads the file to a temp file,
 reads the file into a string, and returns that string
 '''
-def urlToContent(url, scratchDir, sourceEncoding='utf8'):
+def urlToContent(url, sourceEncoding='utf8'):
     res = requests.get(url)
     if res.status_code == 200:
         return res.text
@@ -31,7 +30,7 @@ Creates a new newsSource2 object. For each URL in
 h1-h3URLs, calls the file scraper and appends the new Article object.
 Returns a newsSource2 object
 '''
-def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
+def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
 
     url_parts = urllib.parse.urlparse(url)
     scheme = url_parts.scheme
@@ -40,7 +39,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
     h3URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h3URLs]
 
     h1Arr=[]
-    a=buildArticle(h1URLs[0], name, scratchDir)
+    a=buildArticle(h1URLs[0], name)
     if a==None:
         logger.debug('H1 Nonetype in '+name)
     else:
@@ -48,7 +47,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
 
     h2Arr=[]
     for x in h2URLs:
-        a=buildArticle(x, name, scratchDir)
+        a=buildArticle(x, name)
         if a!=None:
             h2Arr.append(a)
         else:
@@ -56,7 +55,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
 
     h3Arr=[]
     for x in h3URLs:
-        a=buildArticle(x, name, scratchDir)
+        a=buildArticle(x, name)
         if a!=None:
             h3Arr.append(a)
         else:
@@ -161,12 +160,12 @@ def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, b
 
 
 
-def buildTheHill(scratchDir):
+def buildTheHill():
     url='http://thehill.com'
     name='The Hill'
 
     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url, scratchDir)
+    content=urlToContent(url)
 
     #get main headline
     h1=content
@@ -198,7 +197,7 @@ def buildTheHill(scratchDir):
         h3s.append(url+x)
 
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-    hil=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+    hil=buildNewsSource2(name, url, h1s, h2s, h3s)
     hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp', 'Juan Williams', 'Judd Gregg'], None, None)
 
     return hil
@@ -207,14 +206,14 @@ def buildTheHill(scratchDir):
 
 
 
-def buildGuardian(scratchDir):
+def buildGuardian():
 
     url='http://www.theguardian.com/us'
     name='The Guardian US'
 
     while True:
         #DOWNLOAD HOMEPAGE CONTENT
-        content=urlToContent(url, scratchDir, 'utf8')
+        content=urlToContent(url, 'utf8')
 
         #get main headline
         h1=content
@@ -256,20 +255,20 @@ def buildGuardian(scratchDir):
 
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
 
-    gdn=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+    gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
     gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
 
     return gdn
 
 
 
-def buildWashTimes(scratchDir):
+def buildWashTimes():
 
     url='http://www.washingtontimes.com/'
     name='Washington Times'
 
     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url, scratchDir)
+    content=urlToContent(url)
 
     #get main headline
     h1=content
@@ -305,19 +304,19 @@ def buildWashTimes(scratchDir):
 
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
 
-    wat=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+    wat=buildNewsSource2(name, url, h1s, h2s, h3s)
     wat=removeBadStories(wat, None, None, None, None)
 
     return wat
 
 
 
-def buildCSM(scratchDir):
+def buildCSM():
     url='http://www.csmonitor.com/USA'
     name='Christian Science Monitor'
 
     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url, scratchDir)
+    content=urlToContent(url)
 
     #this makes sure we don't get '/USA' in the URL twice
     url=url.split('/USA')[0]
@@ -368,7 +367,7 @@ def buildCSM(scratchDir):
 
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
 
-    csm=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+    csm=buildNewsSource2(name, url, h1s, h2s, h3s)
 
     badTitleArr=['Change Agent']
     badDescArr=None
@@ -388,7 +387,7 @@ in The Blaze articles by grabbing the first portion of the story instead
 def blazeFixDesc(articleArr):
     TAG_RE = re.compile(r'<[^>]+>')
     for i in range(len(articleArr)):
-        desc=urlToContent(articleArr[i].url, scratchDir)
+        desc=urlToContent(articleArr[i].url)
         desc=desc.split('', 1)[1]
         desc=desc.split('', 1)[1]
         desc=TAG_RE.sub('', desc)
@@ -400,12 +399,12 @@ def blazeFixDesc(articleArr):
 
 
 
-def buildBlaze(scratchDir):
+def buildBlaze():
     url='http://theblaze.com'
     name='The Blaze'
 
     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url, scratchDir)
+    content=urlToContent(url)
 
     #get main headline
     h1=content
@@ -439,7 +438,7 @@ def buildBlaze(scratchDir):
 
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
 
-    blz=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+    blz=buildNewsSource2(name, url, h1s, h2s, h3s)
 
     badTitleArr=['Tucker Carlson', 'Mark Levin']
     badDescArr=['Lawrence Jones', 'Mike Slater']
@@ -459,12 +458,12 @@ def buildBlaze(scratchDir):
 
 
 
-def buildCBS(scratchDir):
+def buildCBS():
     url='http://cbsnews.com'
     name='CBS News'
 
     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url, scratchDir)
+    content=urlToContent(url)
 
     #get main headline
     h1=content
@@ -508,7 +507,7 @@ def buildCBS(scratchDir):
         h3s.append(url+x)
 
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-    cbs=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+    cbs=buildNewsSource2(name, url, h1s, h2s, h3s)
     cbs=removeBadStories(cbs, ['60 Minutes'], ['60 Minutes'], None, None, ['whats-in-the-news-coverart'])
 
     return cbs
@@ -517,12 +516,12 @@ def buildCBS(scratchDir):
 
 
 
-def buildNBC(scratchDir):
+def buildNBC():
     url='http://nbcnews.com'
     name='NBC News'
 
     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url, scratchDir)
+    content=urlToContent(url)
 
     #get main headline
     h1=content
@@ -571,7 +570,7 @@ def buildNBC(scratchDir):
     '''
 
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-    nbc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+    nbc=buildNewsSource2(name, url, h1s, h2s, h3s)
 
     nbc=removeBadStories(nbc, None, ['First Read'], None, None, None)
 
@@ -580,12 +579,12 @@ def buildNBC(scratchDir):
 
 
 
-def buildBBC(scratchDir):
+def buildBBC():
     url='http://www.bbc.com/news/world/us_and_canada'
     name='BBC US & Canada'
 
     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url, scratchDir)
+    content=urlToContent(url)
 
     #get main headline
     h1=content
@@ -619,7 +618,7 @@ def buildBBC(scratchDir):
         h3s.append('http://www.bbc.com'+x)
 
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-    bbc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+    bbc=buildNewsSource2(name, url, h1s, h2s, h3s)
     badTitleArr=None
     badDescArr=None
     badAuthorArr=None
@@ -642,12 +641,12 @@ def buildBBC(scratchDir):
 
 
 
-def buildWeeklyStandard(scratchDir):
+def buildWeeklyStandard():
     url='http://www.weeklystandard.com'
     name='Weekly Standard'
 
     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url, scratchDir)
+    content=urlToContent(url)
 
     #get main headline
     h1=content
@@ -692,7 +691,7 @@ def buildWeeklyStandard(scratchDir):
 
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
 
-    wkl=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+    wkl=buildNewsSource2(name, url, h1s, h2s, h3s)
 
     #REMOVE BAD STORIES
     badTitleArr=None
@@ -707,12 +706,12 @@ def buildWeeklyStandard(scratchDir):
 
 
 
-def buildNPR(scratchDir):
+def buildNPR():
     url='http://www.npr.org/sections/news/'
     name='NPR'
 
     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url, scratchDir)
+    content=urlToContent(url)
 
     #get main headline
     h1=content
@@ -746,7 +745,7 @@ def buildNPR(scratchDir):
 
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
 
-    npr=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+    npr=buildNewsSource2(name, url, h1s, h2s, h3s)
 
     #REMOVE BAD STORIES
     badTitleArr=['The Two-Way']
@@ -761,12 +760,12 @@ def buildNPR(scratchDir):
 
 
 
-def buildABC(scratchDir):
+def buildABC():
     url='http://www.abcnews.go.com'
     name='ABC News'
 
     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url, scratchDir)
+    content=urlToContent(url)
 
     #get main headline
     h1=content
@@ -800,7 +799,7 @@ def buildABC(scratchDir):
         h3s.append(x)
 
     h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
-    abc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+    abc=buildNewsSource2(name, url, h1s, h2s, h3s)
 
     #REMOVE BAD STORIES
     badTitleArr=None
@@ -815,12 +814,12 @@ def buildABC(scratchDir):
 
 
 
-def buildFoxNews(scratchDir):
+def buildFoxNews():
     url='http://foxnews.com'
     name='Fox News'
 
     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url, scratchDir)
+    content=urlToContent(url)
 
     #get main headline
     h1=content
@@ -854,7 +853,7 @@ def buildFoxNews(scratchDir):
     h3s = ['http:' + x if x.startswith('//') else x for x in h3s]
 
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-    fox=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+    fox=buildNewsSource2(name, url, h1s, h2s, h3s)
 
     #REMOVE BAD STORIES
     badTitleArr=['O'Reilly', 'Fox News', 'Brett Baier', 'Tucker']
@@ -868,12 +867,12 @@ def buildFoxNews(scratchDir):
 
 
 
-def buildNYT(scratchDir):
+def buildNYT():
     url='http://www.nytimes.com'
     name='New York Times'
 
     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url, scratchDir)
+    content=urlToContent(url)
 
     #get main headline
     #this will likely need if/else logic
@@ -951,7 +950,7 @@ def buildNYT(scratchDir):
 
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
 
-    nyt=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+    nyt=buildNewsSource2(name, url, h1s, h2s, h3s)
 
     nyt=removeBadStories(nyt, None, None, None, None, ['https://www.nytimes.com/section/magazine', 'https://www.nytimes.com/newsletters/the-interpreter'])
 
diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py
index 0181beb..76c80b0 100644
--- a/unbiased/unbiasedFunctions.py
+++ b/unbiased/unbiasedFunctions.py
@@ -5,7 +5,6 @@ import os
 import pkgutil
 import random
 import re
-import subprocess
 import time
 import urllib.parse
 
@@ -17,7 +16,7 @@ from unbiased.unbiasedObjects import *
 logger = logging.getLogger('unbiased')
 
 #take in a url and delimiters, return twitter card
-def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
+def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
 
     debugging=False
     if debugging:
@@ -142,7 +141,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
         return None
 
 
-def buildOutput(newsSourceArr, webroot, scratch):
+def buildOutput(newsSourceArr, webroot):
     #read in the template html file
     from jinja2 import Environment, PackageLoader, select_autoescape
     env = Environment(
@@ -193,7 +192,7 @@ def buildOutput(newsSourceArr, webroot, scratch):
         source=newsSourceArr[h1RandomSources[i]]
         randomArticle=random.sample(range(len(source.h1Arr)), 1)[0]
         article=source.h1Arr[randomArticle]
-        img_name = pullImage(article.img, image_index, webroot, scratch, 350, 200)
+        img_name = pullImage(article.img, image_index, webroot, 350, 200)
         image_index += 1
         article.img = img_name
         top_stories.append(article)
@@ -202,7 +201,7 @@ def buildOutput(newsSourceArr, webroot, scratch):
     for i in range(len(h2RandomPairs)):
         pair=h2RandomPairs[i]
         article=newsSourceArr[pair[0]].h2Arr[pair[1]]
-        img_name = pullImage(article.img, image_index, webroot, scratch, 150, 100)
+        img_name = pullImage(article.img, image_index, webroot, 150, 100)
         image_index += 1
         article.img = img_name
         middle_stories.append(article)
@@ -246,7 +245,7 @@ def printOutputHTML(outputHTML, outDir):
         with open(os.path.join(outDir, filename),
                   'wb') as fp:
             fp.write(data)
 
-def pullImage(url, index, webroot, scratch, target_width=350, target_height=200):
+def pullImage(url, index, webroot, target_width=350, target_height=200):
     extension = url.split('.')[-1].split('?')[0]
     img_name = 'img{}.{}'.format(index, extension)
     res = requests.get(url)
-- 
cgit v1.2.3