author     Matt Singleton <matt@xcolour.net>   2017-04-17 13:44:46 -0400
committer  Matt Singleton <matt@xcolour.net>   2017-04-17 13:44:46 -0400
commit     6a0a5579ea9b3674f011eabd2a4c339100a66ba8 (patch)
tree       f4c994c7843f094a0cd0187a8fec51558c75d692
parent     5b0c9c5daa36878513bcc5edbe87a5fe52fdbb82 (diff)
read the scratch dir path on the command line
-rwxr-xr-x  unbiased/main.py                7
-rwxr-xr-x  unbiased/parser.py            100
-rw-r--r--  unbiased/unbiasedFunctions.py  29
3 files changed, 74 insertions, 62 deletions
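
This commit threads a caller-supplied scratch directory through the scraper call chain instead of hardcoding the relative 'scratch/' path, and swaps the os.system wget invocations for subprocess.check_call. The command-line parsing that actually reads the scratch path is not part of this diff; a minimal sketch of how it could be wired up (the CLI shape and argument names here are assumptions, only run(webroot, scratch) is visible below) might look like:

import argparse

from unbiased.main import run  # run(webroot, scratch), as shown in the main.py hunk below

def main():
    parser = argparse.ArgumentParser(description='unbiased news aggregator')
    parser.add_argument('webroot', help='directory the output HTML is written to')
    parser.add_argument('scratch', help='directory used for temporary downloaded files')
    args = parser.parse_args()
    run(args.webroot, args.scratch)

if __name__ == '__main__':
    main()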
diff --git a/unbiased/main.py b/unbiased/main.py
index b8bd4cb..159a98b 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -49,16 +49,17 @@ def run(webroot, scratch):
possibles = globals().copy()
possibles.update(locals())
method = possibles.get(fn)
- src=method()
+ src=method(scratch)
sourceList.append(src)
break
- except Exception:
+ except Exception as ex:
+ print(ex)
print('Build error. Looping again: '+source)
tries+=1
time.sleep(tries)
#scrape all urls and build data structure
- newsSourceArr=buildNewsSourceArr(sourceList)
+ newsSourceArr=buildNewsSourceArr(sourceList, scratch)
#build the output file HTML
outputHTML=buildOutput(newsSourceArr)
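
The run() hunk above resolves each builder function by name from globals(), calls it with the scratch path, and on failure now prints the caught exception before retrying with a growing delay. A stripped-down sketch of that dispatch-and-retry pattern (source names and the builder prefix are illustrative, not taken from the repository):

import time

def build_example(scratch):
    # stand-in for a real builder such as buildNPR(scratchDir)
    return 'source built using ' + scratch

def build_sources(names, scratch, max_tries=3):
    built = []
    for name in names:
        tries = 0
        while tries < max_tries:
            try:
                method = globals().get('build_' + name)
                built.append(method(scratch))
                break
            except Exception as ex:
                # surface the error, then retry with a linearly growing sleep
                print(ex)
                tries += 1
                time.sleep(tries)
    return built

built = build_sources(['example'], '/tmp/unbiased-scratch')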
diff --git a/unbiased/parser.py b/unbiased/parser.py
index 1f9bc5c..ea2a187 100755
--- a/unbiased/parser.py
+++ b/unbiased/parser.py
@@ -2,6 +2,7 @@
import os
import re
+import subprocess
from unbiased.unbiasedObjects import *
from unbiased.unbiasedFunctions import buildArticle
@@ -11,15 +12,18 @@ from unbiased.unbiasedFunctions import buildArticle
Takes in a URL, downloads the file to a temp file,
reads the file into a string, and returns that string
'''
-def urlToContent(url, sourceEncoding='utf8'):
+def urlToContent(url, scratchDir, sourceEncoding='utf8'):
+ temp_file = os.path.join(scratchDir, 'temp1.html')
+
#download file
- os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url)
+ #os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url)
+ subprocess.check_call(['wget', '-q', '-O', temp_file, '--no-check-certificate', url])
#read file
if sourceEncoding=='utf8':
- f=open('scratch/temp1.html', 'r', encoding="utf8")
+ f=open(temp_file, 'r', encoding="utf8")
else:
- f=open('scratch/temp1.html', 'r', encoding="latin-1")
+ f=open(temp_file, 'r', encoding="latin-1")
content=f.read()
f.close()
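
Moving from a concatenated shell string to subprocess.check_call with an argument list means the URL and the temp-file path are each passed to wget as a single argv element, so shell metacharacters in a URL are never interpreted, and a non-zero wget exit status raises CalledProcessError instead of failing silently. A small sketch of the pattern the new code follows (the helper name here is illustrative):

import os
import subprocess

def fetch_to_scratch(url, scratch_dir, filename='temp1.html'):
    """Download url into scratch_dir with wget and return the saved path."""
    temp_file = os.path.join(scratch_dir, filename)
    # each argument is its own list element: no shell is involved, and
    # check_call raises subprocess.CalledProcessError if wget fails
    subprocess.check_call(['wget', '-q', '-O', temp_file,
                           '--no-check-certificate', url])
    return temp_file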
@@ -31,9 +35,9 @@ Creates a new newsSource2 object. For each URL in h1-h3URLs,
calls the file scraper and appends the new Article object.
Returns a newsSource2 object
'''
-def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
+def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
h1Arr=[]
- a=buildArticle(h1URLs[0], name)
+ a=buildArticle(h1URLs[0], name, scratchDir)
if a==None:
print('................\nH1 Nonetype in '+name+'\n................')
else:
@@ -41,7 +45,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
h2Arr=[]
for x in h2URLs:
- a=buildArticle(x, name)
+ a=buildArticle(x, name, scratchDir)
if a!=None:
h2Arr.append(a)
else:
@@ -50,7 +54,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
h3Arr=[]
for x in h3URLs:
- a=buildArticle(x, name)
+ a=buildArticle(x, name, scratchDir)
if a!=None:
h3Arr.append(a)
else:
@@ -157,12 +161,12 @@ def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, b
-def buildTheHill():
+def buildTheHill(scratchDir):
url='http://thehill.com'
name='The Hill'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
+ content=urlToContent(url, scratchDir)
#get main headline
h1=content
@@ -194,7 +198,7 @@ def buildTheHill():
h3s.append(url+x)
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- hil=buildNewsSource2(name, url, h1s, h2s, h3s)
+ hil=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp', 'Juan Williams', 'Judd Gregg'], None, None)
return hil
@@ -203,14 +207,14 @@ def buildTheHill():
-def buildGuardian():
+def buildGuardian(scratchDir):
url='http://www.theguardian.com/us'
name='The Guardian US'
while True:
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, 'utf8')
+ content=urlToContent(url, scratchDir, 'utf8')
#get main headline
h1=content
@@ -252,20 +256,20 @@ def buildGuardian():
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
+ gdn=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
return gdn
-def buildWashTimes():
+def buildWashTimes(scratchDir):
url='http://www.washingtontimes.com/'
name='Washington Times'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
+ content=urlToContent(url, scratchDir)
#get main headline
h1=content
@@ -301,19 +305,19 @@ def buildWashTimes():
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- wat=buildNewsSource2(name, url, h1s, h2s, h3s)
+ wat=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
wat=removeBadStories(wat, None, None, None, None)
return wat
-def buildCSM():
+def buildCSM(scratchDir):
url='http://www.csmonitor.com/USA'
name='Christian Science Monitor'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
+ content=urlToContent(url, scratchDir)
#this makes sure we don't get '/USA' in the URL twice
url=url.split('/USA')[0]
@@ -364,7 +368,7 @@ def buildCSM():
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- csm=buildNewsSource2(name, url, h1s, h2s, h3s)
+ csm=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
badTitleArr=['Change Agent']
badDescArr=None
@@ -384,7 +388,7 @@ in The Blaze articles by grabbing the first portion of the story instead
def blazeFixDesc(articleArr):
TAG_RE = re.compile(r'<[^>]+>')
for i in range(len(articleArr)):
- desc=urlToContent(articleArr[i].url)
+ desc=urlToContent(articleArr[i].url, scratchDir)
desc=desc.split('<div class="entry-content article-styles">', 1)[1]
desc=desc.split('<p>', 1)[1]
desc=TAG_RE.sub('', desc)
@@ -396,12 +400,12 @@ def blazeFixDesc(articleArr):
-def buildBlaze():
+def buildBlaze(scratchDir):
url='http://theblaze.com'
name='The Blaze'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
+ content=urlToContent(url, scratchDir)
#get main headline
h1=content
@@ -435,7 +439,7 @@ def buildBlaze():
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- blz=buildNewsSource2(name, url, h1s, h2s, h3s)
+ blz=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
badTitleArr=['Tucker Carlson', 'Mark Levin']
badDescArr=['Lawrence Jones', 'Mike Slater']
@@ -455,12 +459,12 @@ def buildBlaze():
-def buildCBS():
+def buildCBS(scratchDir):
url='http://cbsnews.com'
name='CBS News'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
+ content=urlToContent(url, scratchDir)
#get main headline
h1=content
@@ -504,7 +508,7 @@ def buildCBS():
h3s.append(url+x)
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- cbs=buildNewsSource2(name, url, h1s, h2s, h3s)
+ cbs=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
cbs=removeBadStories(cbs, ['60 Minutes'], ['60 Minutes'], None, None, ['whats-in-the-news-coverart'])
return cbs
@@ -513,12 +517,12 @@ def buildCBS():
-def buildNBC():
+def buildNBC(scratchDir):
url='http://nbcnews.com'
name='NBC News'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
+ content=urlToContent(url, scratchDir)
#get main headline
h1=content
@@ -567,7 +571,7 @@ def buildNBC():
'''
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- nbc=buildNewsSource2(name, url, h1s, h2s, h3s)
+ nbc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
nbc=removeBadStories(nbc, None, ['First Read'], None, None, None)
@@ -576,12 +580,12 @@ def buildNBC():
-def buildBBC():
+def buildBBC(scratchDir):
url='http://www.bbc.com/news/world/us_and_canada'
name='BBC US & Canada'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
+ content=urlToContent(url, scratchDir)
#get main headline
h1=content
@@ -615,7 +619,7 @@ def buildBBC():
h3s.append('http://www.bbc.com'+x)
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- bbc=buildNewsSource2(name, url, h1s, h2s, h3s)
+ bbc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
badTitleArr=None
badDescArr=None
badAuthorArr=None
@@ -638,12 +642,12 @@ def buildBBC():
-def buildWeeklyStandard():
+def buildWeeklyStandard(scratchDir):
url='http://www.weeklystandard.com'
name='Weekly Standard'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
+ content=urlToContent(url, scratchDir)
#get main headline
h1=content
@@ -688,7 +692,7 @@ def buildWeeklyStandard():
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- wkl=buildNewsSource2(name, url, h1s, h2s, h3s)
+ wkl=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
#REMOVE BAD STORIES
badTitleArr=None
@@ -703,12 +707,12 @@ def buildWeeklyStandard():
-def buildNPR():
+def buildNPR(scratchDir):
url='http://www.npr.org/sections/news/'
name='NPR'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
+ content=urlToContent(url, scratchDir)
#get main headline
h1=content
@@ -742,7 +746,7 @@ def buildNPR():
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- npr=buildNewsSource2(name, url, h1s, h2s, h3s)
+ npr=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
#REMOVE BAD STORIES
badTitleArr=['The Two-Way']
@@ -757,12 +761,12 @@ def buildNPR():
-def buildABC():
+def buildABC(scratchDir):
url='http://www.abcnews.go.com'
name='ABC News'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
+ content=urlToContent(url, scratchDir)
#get main headline
h1=content
@@ -796,7 +800,7 @@ def buildABC():
h3s.append(x)
h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
- abc=buildNewsSource2(name, url, h1s, h2s, h3s)
+ abc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
#REMOVE BAD STORIES
badTitleArr=None
@@ -811,12 +815,12 @@ def buildABC():
-def buildFoxNews():
+def buildFoxNews(scratchDir):
url='http://foxnews.com'
name='Fox News'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
+ content=urlToContent(url, scratchDir)
#get main headline
h1=content
@@ -847,7 +851,7 @@ def buildFoxNews():
h3s.append(x)
h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
- fox=buildNewsSource2(name, url, h1s, h2s, h3s)
+ fox=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
#REMOVE BAD STORIES
badTitleArr=['O&#039;Reilly', 'Fox News', 'Brett Baier', 'Tucker']
@@ -861,12 +865,12 @@ def buildFoxNews():
-def buildNYT():
+def buildNYT(scratchDir):
url='http://www.nytimes.com'
name='New York Times'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
+ content=urlToContent(url, scratchDir)
#get main headline
#this will likely need if/else logic
@@ -944,7 +948,7 @@ def buildNYT():
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- nyt=buildNewsSource2(name, url, h1s, h2s, h3s)
+ nyt=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
nyt=removeBadStories(nyt, None, None, None, None, ['https://www.nytimes.com/section/magazine', 'https://www.nytimes.com/newsletters/the-interpreter'])
diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py
index 3d3363b..6210ba8 100644
--- a/unbiased/unbiasedFunctions.py
+++ b/unbiased/unbiasedFunctions.py
@@ -2,25 +2,29 @@ import os
import pkgutil
import random
import re
+import subprocess
import time
from unbiased.unbiasedObjects import *
#take in a url and delimiters, return twitter card
-def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
+def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
debugging=False
if debugging:
print(sourceName)
print(url)
print()
-
+
+ temp_article = os.path.join(scratchDir, 'temp_article.html')
+
#download url
- os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url)
+ #os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url)
+ subprocess.check_call(['wget', '-q', '-O', temp_article, '--no-check-certificate', url])
#read the file in
- f=open('scratch/temp_article.html', 'r', encoding="utf8")
+ f=open(temp_article, 'r', encoding="utf8")
content=f.read()
f.close()
@@ -215,7 +219,7 @@ def printOutputHTML(outputHTML, outDir):
with open(os.path.join(outDir, 'unbiased.css'), 'w') as fp:
fp.write(css)
-def buildNewsSourceArr(sourceList):
+def buildNewsSourceArr(sourceList, scratchDir):
#build the data structure
i=0
@@ -229,16 +233,19 @@ def buildNewsSourceArr(sourceList):
url=source.url
+ temp_file = os.path.join(scratchDir, 'temp{}.html'.format(i))
+
#download file
- os.system('wget -q -O scratch/temp'+str(i)+'.html --no-check-certificate '+url)
+ #os.system('wget -q -O scratch/temp'+str(i)+'.html --no-check-certificate '+url)
+ subprocess.check_call(['wget', '-q', '-O', temp_file, '--no-check-certificate', url])
#read file
- f=open('scratch/temp'+str(i)+'.html', 'r', encoding="utf8")
+ f=open(temp_file, 'r', encoding="utf8")
content=f.read()
f.close()
#delete file MAYBE DON'T DO THIS? CAUSES OS ERRORS
- #os.remove('scratch/temp'+str(i)+'.html')
+ #os.remove(temp_file)
#add stories etc to the NewsSource object
h1s, h2s, h3s=extractURLs(content, source)
@@ -246,13 +253,13 @@ def buildNewsSourceArr(sourceList):
#build the Article objects and add to newsSource's appropriate list
if h1s!=None and h2s!=None:
for url in h1s:
- article=buildArticle(url, source.name)
+ article=buildArticle(url, source.name, scratchDir)
if article!=None: source.addArticle(article, 1) #sourceList[i].h1Arr.append(article)
for url in h2s:
- article=buildArticle(url, source.name)
+ article=buildArticle(url, source.name, scratchDir)
if article!=None: sourceList[i].h2Arr.append(article)
for url in h3s:
- article=buildArticle(url, source.name)
+ article=buildArticle(url, source.name, scratchDir)
if article!=None: sourceList[i].h3Arr.append(article)
i+=1
else:
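
With the temp-file names now rooted in a caller-supplied scratch directory, cleanup no longer has to happen file by file inside the loop (the per-file os.remove above stays commented out). One option, not part of this commit, is to create the scratch area as a temporary directory at the call site so everything under it is removed when the run finishes:

import tempfile

from unbiased.main import run  # hypothetical wiring, not in this diff

webroot = 'html_out'  # illustrative output directory
with tempfile.TemporaryDirectory(prefix='unbiased-') as scratch:
    run(webroot, scratch)  # everything under scratch is deleted on exit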