From 6a0a5579ea9b3674f011eabd2a4c339100a66ba8 Mon Sep 17 00:00:00 2001
From: Matt Singleton
Date: Mon, 17 Apr 2017 13:44:46 -0400
Subject: read the scratch dir path on the command line

---
 unbiased/main.py              |   7 +--
 unbiased/parser.py            | 100 ++++++++++++++++++++++--------------
 unbiased/unbiasedFunctions.py |  29 +++++++-----
 3 files changed, 74 insertions(+), 62 deletions(-)

diff --git a/unbiased/main.py b/unbiased/main.py
index b8bd4cb..159a98b 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -49,16 +49,17 @@ def run(webroot, scratch):
                 possibles = globals().copy()
                 possibles.update(locals())
                 method = possibles.get(fn)
-                src=method()
+                src=method(scratch)
                 sourceList.append(src)
                 break
-            except Exception:
+            except Exception as ex:
+                print(ex)
                 print('Build error. Looping again: '+source)
                 tries+=1
                 time.sleep(tries)

     #scrape all urls and build data structure
-    newsSourceArr=buildNewsSourceArr(sourceList)
+    newsSourceArr=buildNewsSourceArr(sourceList, scratch)

     #build the output file HTML
     outputHTML=buildOutput(newsSourceArr)
diff --git a/unbiased/parser.py b/unbiased/parser.py
index 1f9bc5c..ea2a187 100755
--- a/unbiased/parser.py
+++ b/unbiased/parser.py
@@ -2,6 +2,7 @@

 import os
 import re
+import subprocess

 from unbiased.unbiasedObjects import *
 from unbiased.unbiasedFunctions import buildArticle
@@ -11,15 +12,18 @@ from unbiased.unbiasedFunctions import buildArticle
 Takes in a URL, downloads the file to a temp file,
 reads the file into a string, and returns that string
 '''
-def urlToContent(url, sourceEncoding='utf8'):
+def urlToContent(url, scratchDir, sourceEncoding='utf8'):
+    temp_file = os.path.join(scratchDir, 'temp1.html')
+
     #download file
-    os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url)
+    #os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url)
+    subprocess.check_call(['wget', '-q', '-O', temp_file, '--no-check-certificate', url])

     #read file
     if sourceEncoding=='utf8':
-        f=open('scratch/temp1.html', 'r', encoding="utf8")
+        f=open(temp_file, 'r', encoding="utf8")
     else:
-        f=open('scratch/temp1.html', 'r', encoding="latin-1")
+        f=open(temp_file, 'r', encoding="latin-1")
     content=f.read()
     f.close()

@@ -31,9 +35,9 @@ Creates a new newsSource2 object.
 For each URL in h1-h3URLs, calls the file scraper and appends the new Article object.
 Returns a newsSource2 object
 '''
-def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
+def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
     h1Arr=[]
-    a=buildArticle(h1URLs[0], name)
+    a=buildArticle(h1URLs[0], name, scratchDir)
     if a==None:
         print('................\nH1 Nonetype in '+name+'\n................')
     else:
@@ -41,7 +45,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):

     h2Arr=[]
     for x in h2URLs:
-        a=buildArticle(x, name)
+        a=buildArticle(x, name, scratchDir)
         if a!=None:
             h2Arr.append(a)
         else:
@@ -50,7 +54,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):

     h3Arr=[]
     for x in h3URLs:
-        a=buildArticle(x, name)
+        a=buildArticle(x, name, scratchDir)
         if a!=None:
             h3Arr.append(a)
         else:
@@ -157,12 +161,12 @@ def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, b



-def buildTheHill():
+def buildTheHill(scratchDir):
     url='http://thehill.com'
     name='The Hill'

     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url)
+    content=urlToContent(url, scratchDir)

     #get main headline
     h1=content
@@ -194,7 +198,7 @@ def buildTheHill():
         h3s.append(url+x)

     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-    hil=buildNewsSource2(name, url, h1s, h2s, h3s)
+    hil=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
     hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp', 'Juan Williams', 'Judd Gregg'], None, None)

     return hil
@@ -203,14 +207,14 @@



-def buildGuardian():
+def buildGuardian(scratchDir):
     url='http://www.theguardian.com/us'
     name='The Guardian US'

     while True:

         #DOWNLOAD HOMEPAGE CONTENT
-        content=urlToContent(url, 'utf8')
+        content=urlToContent(url, scratchDir, 'utf8')

         #get main headline
         h1=content
@@ -252,20 +256,20 @@

     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)

-    gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
+    gdn=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
     gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None)

     return gdn



-def buildWashTimes():
+def buildWashTimes(scratchDir):
     url='http://www.washingtontimes.com/'
     name='Washington Times'

     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url)
+    content=urlToContent(url, scratchDir)

     #get main headline
     h1=content
@@ -301,19 +305,19 @@

     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)

-    wat=buildNewsSource2(name, url, h1s, h2s, h3s)
+    wat=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
     wat=removeBadStories(wat, None, None, None, None)

     return wat



-def buildCSM():
+def buildCSM(scratchDir):
     url='http://www.csmonitor.com/USA'
     name='Christian Science Monitor'

     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url)
+    content=urlToContent(url, scratchDir)

     #this makes sure we don't get '/USA' in the URL twice
     url=url.split('/USA')[0]
@@ -364,7 +368,7 @@

     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-    csm=buildNewsSource2(name, url, h1s, h2s, h3s)
+    csm=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)

     badTitleArr=['Change Agent']
     badDescArr=None
@@ -384,7 +388,7 @@ in The Blaze articles by grabbing the first portion of the story instead
 def blazeFixDesc(articleArr):
     TAG_RE = re.compile(r'<[^>]+>')
     for i in range(len(articleArr)):
-        desc=urlToContent(articleArr[i].url)
+        desc=urlToContent(articleArr[i].url, scratchDir)
         desc=desc.split('…', 1)[1]
         desc=desc.split('…', 1)[1]
         desc=TAG_RE.sub('', desc)
@@ -396,12 +400,12 @@



-def buildBlaze():
+def buildBlaze(scratchDir):
     url='http://theblaze.com'
     name='The Blaze'

     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url)
+    content=urlToContent(url, scratchDir)

     #get main headline
     h1=content
@@ -435,7 +439,7 @@

     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)

-    blz=buildNewsSource2(name, url, h1s, h2s, h3s)
+    blz=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)

     badTitleArr=['Tucker Carlson', 'Mark Levin']
     badDescArr=['Lawrence Jones', 'Mike Slater']
@@ -455,12 +459,12 @@



-def buildCBS():
+def buildCBS(scratchDir):
     url='http://cbsnews.com'
     name='CBS News'

     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url)
+    content=urlToContent(url, scratchDir)

     #get main headline
     h1=content
@@ -504,7 +508,7 @@
         h3s.append(url+x)

     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-    cbs=buildNewsSource2(name, url, h1s, h2s, h3s)
+    cbs=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
     cbs=removeBadStories(cbs, ['60 Minutes'], ['60 Minutes'], None, None, ['whats-in-the-news-coverart'])

     return cbs
@@ -513,12 +517,12 @@



-def buildNBC():
+def buildNBC(scratchDir):
     url='http://nbcnews.com'
     name='NBC News'

     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url)
+    content=urlToContent(url, scratchDir)

     #get main headline
     h1=content
@@ -567,7 +571,7 @@
     '''
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-    nbc=buildNewsSource2(name, url, h1s, h2s, h3s)
+    nbc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)

     nbc=removeBadStories(nbc, None, ['First Read'], None, None, None)

@@ -576,12 +580,12 @@



-def buildBBC():
+def buildBBC(scratchDir):
     url='http://www.bbc.com/news/world/us_and_canada'
     name='BBC US & Canada'

     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url)
+    content=urlToContent(url, scratchDir)

     #get main headline
     h1=content
@@ -615,7 +619,7 @@
         h3s.append('http://www.bbc.com'+x)

     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-    bbc=buildNewsSource2(name, url, h1s, h2s, h3s)
+    bbc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
     badTitleArr=None
     badDescArr=None
     badAuthorArr=None
@@ -638,12 +642,12 @@



-def buildWeeklyStandard():
+def buildWeeklyStandard(scratchDir):
     url='http://www.weeklystandard.com'
     name='Weekly Standard'

     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url)
+    content=urlToContent(url, scratchDir)

     #get main headline
     h1=content
@@ -688,7 +692,7 @@

     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)

-    wkl=buildNewsSource2(name, url, h1s, h2s, h3s)
+    wkl=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)

     #REMOVE BAD STORIES
     badTitleArr=None
@@ -703,12 +707,12 @@



-def buildNPR():
+def buildNPR(scratchDir):
     url='http://www.npr.org/sections/news/'
     name='NPR'

     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url)
+    content=urlToContent(url, scratchDir)

     #get main headline
     h1=content
@@ -742,7 +746,7 @@

     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)

-    npr=buildNewsSource2(name, url, h1s, h2s, h3s)
+    npr=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)

     #REMOVE BAD STORIES
     badTitleArr=['The Two-Way']
@@ -757,12 +761,12 @@



-def buildABC():
+def buildABC(scratchDir):
     url='http://www.abcnews.go.com'
     name='ABC News'

     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url)
+    content=urlToContent(url, scratchDir)

     #get main headline
     h1=content
@@ -796,7 +800,7 @@ def buildABC():
         h3s.append(x)

     h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
-    abc=buildNewsSource2(name, url, h1s, h2s, h3s)
+    abc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)

     #REMOVE BAD STORIES
     badTitleArr=None
@@ -811,12 +815,12 @@ def buildABC():



-def buildFoxNews():
+def buildFoxNews(scratchDir):
     url='http://foxnews.com'
     name='Fox News'

     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url)
+    content=urlToContent(url, scratchDir)

     #get main headline
     h1=content
@@ -847,7 +851,7 @@ def buildFoxNews():
         h3s.append(x)

     h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
-    fox=buildNewsSource2(name, url, h1s, h2s, h3s)
+    fox=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)

     #REMOVE BAD STORIES
     badTitleArr=['O'Reilly', 'Fox News', 'Brett Baier', 'Tucker']
@@ -861,12 +865,12 @@ def buildFoxNews():



-def buildNYT():
+def buildNYT(scratchDir):
     url='http://www.nytimes.com'
     name='New York Times'

     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url)
+    content=urlToContent(url, scratchDir)

     #get main headline
     #this will likely need if/else logic
@@ -944,7 +948,7 @@ def buildNYT():

     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)

-    nyt=buildNewsSource2(name, url, h1s, h2s, h3s)
+    nyt=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)

     nyt=removeBadStories(nyt, None, None, None, None, ['https://www.nytimes.com/section/magazine', 'https://www.nytimes.com/newsletters/the-interpreter'])
diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py
index 3d3363b..6210ba8 100644
--- a/unbiased/unbiasedFunctions.py
+++ b/unbiased/unbiasedFunctions.py
@@ -2,25 +2,29 @@
 import os
 import pkgutil
 import random
 import re
+import subprocess
 import time

 from unbiased.unbiasedObjects import *


 #take in a url and delimiters, return twitter card
-def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
+def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):

     debugging=False
     if debugging:
         print(sourceName)
         print(url)
         print()
-
+
+    temp_article = os.path.join(scratchDir, 'temp_article.html')
+
     #download url
-    os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url)
+    #os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url)
+    subprocess.check_call(['wget', '-q', '-O', temp_article, '--no-check-certificate', url])

     #read the file in
-    f=open('scratch/temp_article.html', 'r', encoding="utf8")
+    f=open(temp_article, 'r', encoding="utf8")
     content=f.read()
     f.close()
@@ -215,7 +219,7 @@ def printOutputHTML(outputHTML, outDir):
     with open(os.path.join(outDir, 'unbiased.css'), 'w') as fp:
         fp.write(css)

-def buildNewsSourceArr(sourceList):
+def buildNewsSourceArr(sourceList, scratchDir):

     #build the data structure
     i=0
@@ -229,16 +233,19 @@ def buildNewsSourceArr(sourceList):

         url=source.url

+        temp_file = os.path.join(scratchDir, 'temp{}.html'.format(i))
+
         #download file
-        os.system('wget -q -O scratch/temp'+str(i)+'.html --no-check-certificate '+url)
+        #os.system('wget -q -O scratch/temp'+str(i)+'.html --no-check-certificate '+url)
+        subprocess.check_call(['wget', '-q', '-O', temp_file, '--no-check-certificate', url])

         #read file
-        f=open('scratch/temp'+str(i)+'.html', 'r', encoding="utf8")
+        f=open(temp_file, 'r', encoding="utf8")
         content=f.read()
         f.close()

         #delete file MAYBE DON'T DO THIS? CAUSES OS ERRORS
-        #os.remove('scratch/temp'+str(i)+'.html')
+        #os.remove(temp_file)

         #add stories etc to the NewsSource object
         h1s, h2s, h3s=extractURLs(content, source)

         #build the Article objects and add to newsSource's appropriate list
         if h1s!=None and h2s!=None:
             for url in h1s:
-                article=buildArticle(url, source.name)
+                article=buildArticle(url, source.name, scratchDir)
                 if article!=None: source.addArticle(article, 1) #sourceList[i].h1Arr.append(article)
             for url in h2s:
-                article=buildArticle(url, source.name)
+                article=buildArticle(url, source.name, scratchDir)
                 if article!=None: sourceList[i].h2Arr.append(article)
             for url in h3s:
-                article=buildArticle(url, source.name)
+                article=buildArticle(url, source.name, scratchDir)
                 if article!=None: sourceList[i].h3Arr.append(article)
             i+=1
         else:
-- 
cgit v1.2.3
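
The patch above swaps os.system string concatenation for subprocess.check_call and derives every temp-file path from the scratch directory passed in on the command line. Below is a minimal standalone sketch of that pattern; it is an illustration, not code from the repository, and the helper name fetch_to_scratch and the /tmp/unbiased path are assumptions.

    import os
    import subprocess

    def fetch_to_scratch(url, scratch_dir, filename='temp1.html', encoding='utf8'):
        # Hypothetical helper: build the temp path from the caller-supplied
        # scratch dir instead of a hard-coded 'scratch/' folder.
        temp_file = os.path.join(scratch_dir, filename)
        # Passing arguments as a list means the shell never parses the URL,
        # unlike the old os.system string concatenation.
        subprocess.check_call(['wget', '-q', '-O', temp_file, '--no-check-certificate', url])
        with open(temp_file, 'r', encoding=encoding) as f:
            return f.read()

    if __name__ == '__main__':
        # Example usage; the scratch location is arbitrary.
        html = fetch_to_scratch('http://thehill.com', '/tmp/unbiased')
        print(len(html))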