author     Matt Singleton <matt@xcolour.net>    2017-04-19 16:47:30 -0400
committer  Matt Singleton <matt@xcolour.net>    2017-04-19 16:47:30 -0400
commit     8dffc67fae2c5a6cc1fe125809e0b74d8b4b28f3 (patch)
tree       f7188ac5c7a4fb2a36f74c52699d03dfe5a783ce
parent     7a8efb94dc2463a6d30afc77f10df78ebfa4c353 (diff)
don't need a scratch directory any more
-rwxr-xr-x  unbiased/main.py               10
-rwxr-xr-x  unbiased/parser.py             91
-rw-r--r--  unbiased/unbiasedFunctions.py  11
3 files changed, 54 insertions, 58 deletions
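
The commit removes the scratch-directory plumbing end to end: page and image content is fetched with requests directly into memory (the subprocess import goes away in both files), so nothing needs to be staged on disk. A condensed sketch of the signatures after the change, taken from the hunks below with bodies elided:

    def urlToContent(url, sourceEncoding='utf8'): ...                              # parser.py
    def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs): ...                   # parser.py
    def buildArticle(url, sourceName, encoding=None): ...                          # unbiasedFunctions.py
    def buildOutput(newsSourceArr, webroot): ...                                   # unbiasedFunctions.py
    def pullImage(url, index, webroot, target_width=350, target_height=200): ...   # unbiasedFunctions.py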
diff --git a/unbiased/main.py b/unbiased/main.py
index c760788..60211ea 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -19,14 +19,13 @@ logger.addHandler(ch)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-w', '--webroot', default='/var/www/ubiased', help='location to write the output html')
- parser.add_argument('-s', '--scratch', default='/opt/unbiased/scratch', help='writable scratch workspace')
args = parser.parse_args()
crawl_frequency = 600
while True:
logger.info('Starting crawl')
start = time.time()
- run(args.webroot, args.scratch)
+ run(args.webroot)
finish = time.time()
runtime = finish - start
sleeptime = crawl_frequency - runtime
@@ -34,7 +33,7 @@ def main():
if sleeptime > 0:
time.sleep(sleeptime)
-def run(webroot, scratch):
+def run(webroot):
sourceList=[]
'''
@@ -47,7 +46,6 @@ def run(webroot, scratch):
'''
logger.debug('Running with webroot="{}"'.format(webroot))
- logger.debug('Running with scratch="{}"'.format(scratch))
### These values have to be the second half of the function name
@@ -65,7 +63,7 @@ def run(webroot, scratch):
possibles = globals().copy()
possibles.update(locals())
method = possibles.get(fn)
- src=method(scratch)
+ src=method()
sourceList.append(src)
break
except Exception as ex:
@@ -79,7 +77,7 @@ def run(webroot, scratch):
newsSourceArr = sourceList
#build the output file HTML
- outputHTML=buildOutput(newsSourceArr, webroot, scratch)
+ outputHTML=buildOutput(newsSourceArr, webroot)
#print the output file HTML
printOutputHTML(outputHTML, webroot)
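
With the -s/--scratch flag removed, run() is driven by the webroot alone. A minimal sketch of calling it directly, assuming the import path matches the file layout above (the webroot path is an example, not taken from the commit):

    from unbiased.main import run

    # Crawl each source once and write the generated HTML under the webroot.
    run('/var/www/unbiased')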
diff --git a/unbiased/parser.py b/unbiased/parser.py
index 0a8398c..41727f5 100755
--- a/unbiased/parser.py
+++ b/unbiased/parser.py
@@ -3,7 +3,6 @@
import logging
import os
import re
-import subprocess
import urllib.parse
import requests
@@ -18,7 +17,7 @@ logger = logging.getLogger('unbiased')
Takes in a URL, downloads the file to a temp file,
reads the file into a string, and returns that string
'''
-def urlToContent(url, scratchDir, sourceEncoding='utf8'):
+def urlToContent(url, sourceEncoding='utf8'):
res = requests.get(url)
if res.status_code == 200:
return res.text
@@ -31,7 +30,7 @@ Creates a new newsSource2 object. For each URL in h1-h3URLs,
calls the file scraper and appends the new Article object.
Returns a newsSource2 object
'''
-def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
+def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
url_parts = urllib.parse.urlparse(url)
scheme = url_parts.scheme
@@ -40,7 +39,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
h3URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h3URLs]
h1Arr=[]
- a=buildArticle(h1URLs[0], name, scratchDir)
+ a=buildArticle(h1URLs[0], name)
if a==None:
logger.debug('H1 Nonetype in '+name)
else:
@@ -48,7 +47,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
h2Arr=[]
for x in h2URLs:
- a=buildArticle(x, name, scratchDir)
+ a=buildArticle(x, name)
if a!=None:
h2Arr.append(a)
else:
@@ -56,7 +55,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
h3Arr=[]
for x in h3URLs:
- a=buildArticle(x, name, scratchDir)
+ a=buildArticle(x, name)
if a!=None:
h3Arr.append(a)
else:
@@ -161,12 +160,12 @@ def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, b
-def buildTheHill(scratchDir):
+def buildTheHill():
url='http://thehill.com'
name='The Hill'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -198,7 +197,7 @@ def buildTheHill(scratchDir):
h3s.append(url+x)
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- hil=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ hil=buildNewsSource2(name, url, h1s, h2s, h3s)
hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp', 'Juan Williams', 'Judd Gregg'], None, None)
return hil
@@ -207,14 +206,14 @@ def buildTheHill(scratchDir):
-def buildGuardian(scratchDir):
+def buildGuardian():
url='http://www.theguardian.com/us'
name='The Guardian US'
while True:
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir, 'utf8')
+ content=urlToContent(url, 'utf8')
#get main headline
h1=content
@@ -256,20 +255,20 @@ def buildGuardian(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- gdn=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
return gdn
-def buildWashTimes(scratchDir):
+def buildWashTimes():
url='http://www.washingtontimes.com/'
name='Washington Times'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -305,19 +304,19 @@ def buildWashTimes(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- wat=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ wat=buildNewsSource2(name, url, h1s, h2s, h3s)
wat=removeBadStories(wat, None, None, None, None)
return wat
-def buildCSM(scratchDir):
+def buildCSM():
url='http://www.csmonitor.com/USA'
name='Christian Science Monitor'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#this makes sure we don't get '/USA' in the URL twice
url=url.split('/USA')[0]
@@ -368,7 +367,7 @@ def buildCSM(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- csm=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ csm=buildNewsSource2(name, url, h1s, h2s, h3s)
badTitleArr=['Change Agent']
badDescArr=None
@@ -388,7 +387,7 @@ in The Blaze articles by grabbing the first portion of the story instead
def blazeFixDesc(articleArr):
TAG_RE = re.compile(r'<[^>]+>')
for i in range(len(articleArr)):
- desc=urlToContent(articleArr[i].url, scratchDir)
+ desc=urlToContent(articleArr[i].url)
desc=desc.split('<div class="entry-content article-styles">', 1)[1]
desc=desc.split('<p>', 1)[1]
desc=TAG_RE.sub('', desc)
@@ -400,12 +399,12 @@ def blazeFixDesc(articleArr):
-def buildBlaze(scratchDir):
+def buildBlaze():
url='http://theblaze.com'
name='The Blaze'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -439,7 +438,7 @@ def buildBlaze(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- blz=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ blz=buildNewsSource2(name, url, h1s, h2s, h3s)
badTitleArr=['Tucker Carlson', 'Mark Levin']
badDescArr=['Lawrence Jones', 'Mike Slater']
@@ -459,12 +458,12 @@ def buildBlaze(scratchDir):
-def buildCBS(scratchDir):
+def buildCBS():
url='http://cbsnews.com'
name='CBS News'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -508,7 +507,7 @@ def buildCBS(scratchDir):
h3s.append(url+x)
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- cbs=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ cbs=buildNewsSource2(name, url, h1s, h2s, h3s)
cbs=removeBadStories(cbs, ['60 Minutes'], ['60 Minutes'], None, None, ['whats-in-the-news-coverart'])
return cbs
@@ -517,12 +516,12 @@ def buildCBS(scratchDir):
-def buildNBC(scratchDir):
+def buildNBC():
url='http://nbcnews.com'
name='NBC News'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -571,7 +570,7 @@ def buildNBC(scratchDir):
'''
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- nbc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ nbc=buildNewsSource2(name, url, h1s, h2s, h3s)
nbc=removeBadStories(nbc, None, ['First Read'], None, None, None)
@@ -580,12 +579,12 @@ def buildNBC(scratchDir):
-def buildBBC(scratchDir):
+def buildBBC():
url='http://www.bbc.com/news/world/us_and_canada'
name='BBC US & Canada'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -619,7 +618,7 @@ def buildBBC(scratchDir):
h3s.append('http://www.bbc.com'+x)
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- bbc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ bbc=buildNewsSource2(name, url, h1s, h2s, h3s)
badTitleArr=None
badDescArr=None
badAuthorArr=None
@@ -642,12 +641,12 @@ def buildBBC(scratchDir):
-def buildWeeklyStandard(scratchDir):
+def buildWeeklyStandard():
url='http://www.weeklystandard.com'
name='Weekly Standard'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -692,7 +691,7 @@ def buildWeeklyStandard(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- wkl=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ wkl=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
badTitleArr=None
@@ -707,12 +706,12 @@ def buildWeeklyStandard(scratchDir):
-def buildNPR(scratchDir):
+def buildNPR():
url='http://www.npr.org/sections/news/'
name='NPR'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -746,7 +745,7 @@ def buildNPR(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- npr=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ npr=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
badTitleArr=['The Two-Way']
@@ -761,12 +760,12 @@ def buildNPR(scratchDir):
-def buildABC(scratchDir):
+def buildABC():
url='http://www.abcnews.go.com'
name='ABC News'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -800,7 +799,7 @@ def buildABC(scratchDir):
h3s.append(x)
h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
- abc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ abc=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
badTitleArr=None
@@ -815,12 +814,12 @@ def buildABC(scratchDir):
-def buildFoxNews(scratchDir):
+def buildFoxNews():
url='http://foxnews.com'
name='Fox News'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -854,7 +853,7 @@ def buildFoxNews(scratchDir):
h3s = ['http:' + x if x.startswith('//') else x for x in h3s]
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- fox=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ fox=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
badTitleArr=['O&#039;Reilly', 'Fox News', 'Brett Baier', 'Tucker']
@@ -868,12 +867,12 @@ def buildFoxNews(scratchDir):
-def buildNYT(scratchDir):
+def buildNYT():
url='http://www.nytimes.com'
name='New York Times'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
#this will likely need if/else logic
@@ -951,7 +950,7 @@ def buildNYT(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- nyt=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ nyt=buildNewsSource2(name, url, h1s, h2s, h3s)
nyt=removeBadStories(nyt, None, None, None, None, ['https://www.nytimes.com/section/magazine', 'https://www.nytimes.com/newsletters/the-interpreter'])
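
Every build* helper in parser.py is now zero-argument, which is what lets main.py's name-based dispatch call method() with no parameters. A direct-call equivalent using one of the builders changed above (sketch only; mirrors the loop in main.py):

    from unbiased.parser import buildNPR

    npr = buildNPR()      # downloads http://www.npr.org/sections/news/ and scrapes its headlines
    sourceList = [npr]    # main.py appends each source here before handing the list to buildOutput()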
diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py
index 0181beb..76c80b0 100644
--- a/unbiased/unbiasedFunctions.py
+++ b/unbiased/unbiasedFunctions.py
@@ -5,7 +5,6 @@ import os
import pkgutil
import random
import re
-import subprocess
import time
import urllib.parse
@@ -17,7 +16,7 @@ from unbiased.unbiasedObjects import *
logger = logging.getLogger('unbiased')
#take in a url and delimiters, return twitter card
-def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
+def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
debugging=False
if debugging:
@@ -142,7 +141,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
return None
-def buildOutput(newsSourceArr, webroot, scratch):
+def buildOutput(newsSourceArr, webroot):
#read in the template html file
from jinja2 import Environment, PackageLoader, select_autoescape
env = Environment(
@@ -193,7 +192,7 @@ def buildOutput(newsSourceArr, webroot, scratch):
source=newsSourceArr[h1RandomSources[i]]
randomArticle=random.sample(range(len(source.h1Arr)), 1)[0]
article=source.h1Arr[randomArticle]
- img_name = pullImage(article.img, image_index, webroot, scratch, 350, 200)
+ img_name = pullImage(article.img, image_index, webroot, 350, 200)
image_index += 1
article.img = img_name
top_stories.append(article)
@@ -202,7 +201,7 @@ def buildOutput(newsSourceArr, webroot, scratch):
for i in range(len(h2RandomPairs)):
pair=h2RandomPairs[i]
article=newsSourceArr[pair[0]].h2Arr[pair[1]]
- img_name = pullImage(article.img, image_index, webroot, scratch, 150, 100)
+ img_name = pullImage(article.img, image_index, webroot, 150, 100)
image_index += 1
article.img = img_name
middle_stories.append(article)
@@ -246,7 +245,7 @@ def printOutputHTML(outputHTML, outDir):
with open(os.path.join(outDir, filename), 'wb') as fp:
fp.write(data)
-def pullImage(url, index, webroot, scratch, target_width=350, target_height=200):
+def pullImage(url, index, webroot, target_width=350, target_height=200):
extension = url.split('.')[-1].split('?')[0]
img_name = 'img{}.{}'.format(index, extension)
res = requests.get(url)