author    Matt Singleton <matt@xcolour.net>  2017-04-19 16:47:30 -0400
committer Matt Singleton <matt@xcolour.net>  2017-04-19 16:47:30 -0400
commit    8dffc67fae2c5a6cc1fe125809e0b74d8b4b28f3 (patch)
tree      f7188ac5c7a4fb2a36f74c52699d03dfe5a783ce
parent    7a8efb94dc2463a6d30afc77f10df78ebfa4c353 (diff)
don't need a scratch directory any more
-rwxr-xr-x  unbiased/main.py               10
-rwxr-xr-x  unbiased/parser.py             91
-rw-r--r--  unbiased/unbiasedFunctions.py  11
3 files changed, 54 insertions, 58 deletions
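This commit strips the scratch-directory plumbing out of the crawler: pages are fetched straight into memory with requests, so urlToContent, buildArticle, buildNewsSource2, the per-source build functions, buildOutput, and pullImage all lose their scratch/scratchDir parameter. A minimal sketch of the fetch helper as it stands after this change; the success path is taken from the parser.py hunk below, while the failure branch and the comment about sourceEncoding are assumptions, not part of the commit:

    import requests

    def urlToContent(url, sourceEncoding='utf8'):
        # Fetch the page straight into memory; no temp file or scratch directory.
        # sourceEncoding is kept for existing call sites; requests decodes the body itself.
        res = requests.get(url)
        if res.status_code == 200:
            return res.text
        # Assumed failure path; the diff below only shows the 200 case.
        raise RuntimeError('could not download ' + url)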
diff --git a/unbiased/main.py b/unbiased/main.py
index c760788..60211ea 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -19,14 +19,13 @@ logger.addHandler(ch)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-w', '--webroot', default='/var/www/ubiased', help='location to write the output html')
- parser.add_argument('-s', '--scratch', default='/opt/unbiased/scratch', help='writable scratch workspace')
args = parser.parse_args()
crawl_frequency = 600
while True:
logger.info('Starting crawl')
start = time.time()
- run(args.webroot, args.scratch)
+ run(args.webroot)
finish = time.time()
runtime = finish - start
sleeptime = crawl_frequency - runtime
@@ -34,7 +33,7 @@ def main():
if sleeptime > 0:
time.sleep(sleeptime)
-def run(webroot, scratch):
+def run(webroot):
sourceList=[]
'''
@@ -47,7 +46,6 @@ def run(webroot, scratch):
'''
logger.debug('Running with webroot="{}"'.format(webroot))
- logger.debug('Running with scratch="{}"'.format(scratch))
### These values have to be the second half of the function name
@@ -65,7 +63,7 @@ def run(webroot, scratch):
possibles = globals().copy()
possibles.update(locals())
method = possibles.get(fn)
- src=method(scratch)
+ src=method()
sourceList.append(src)
break
except Exception as ex:
@@ -79,7 +77,7 @@ def run(webroot, scratch):
newsSourceArr = sourceList
#build the output file HTML
- outputHTML=buildOutput(newsSourceArr, webroot, scratch)
+ outputHTML=buildOutput(newsSourceArr, webroot)
#print the output file HTML
printOutputHTML(outputHTML, webroot)
diff --git a/unbiased/parser.py b/unbiased/parser.py
index 0a8398c..41727f5 100755
--- a/unbiased/parser.py
+++ b/unbiased/parser.py
@@ -3,7 +3,6 @@ import logging
import os
import re
-import subprocess
import urllib.parse
import requests
@@ -18,7 +17,7 @@ logger = logging.getLogger('unbiased')
Takes in a URL, downloads the file to a temp file,
reads the file into a string, and returns that string
'''
-def urlToContent(url, scratchDir, sourceEncoding='utf8'):
+def urlToContent(url, sourceEncoding='utf8'):
res = requests.get(url)
if res.status_code == 200:
return res.text
@@ -31,7 +30,7 @@ Creates a new newsSource2 object.
For each URL in h1-h3URLs, calls the file scraper and appends the new Article object.
Returns a newsSource2 object
'''
-def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
+def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
url_parts = urllib.parse.urlparse(url)
scheme = url_parts.scheme
@@ -40,7 +39,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
h3URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h3URLs]
h1Arr=[]
- a=buildArticle(h1URLs[0], name, scratchDir)
+ a=buildArticle(h1URLs[0], name)
if a==None:
logger.debug('H1 Nonetype in '+name)
else:
@@ -48,7 +47,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
h2Arr=[]
for x in h2URLs:
- a=buildArticle(x, name, scratchDir)
+ a=buildArticle(x, name)
if a!=None:
h2Arr.append(a)
else:
@@ -56,7 +55,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
h3Arr=[]
for x in h3URLs:
- a=buildArticle(x, name, scratchDir)
+ a=buildArticle(x, name)
if a!=None:
h3Arr.append(a)
else:
@@ -161,12 +160,12 @@ def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, b
-def buildTheHill(scratchDir):
+def buildTheHill():
url='http://thehill.com'
name='The Hill'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -198,7 +197,7 @@ def buildTheHill(scratchDir):
h3s.append(url+x)
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- hil=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ hil=buildNewsSource2(name, url, h1s, h2s, h3s)
hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp', 'Juan Williams', 'Judd Gregg'], None, None)
return hil
@@ -207,14 +206,14 @@ def buildTheHill(scratchDir):
-def buildGuardian(scratchDir):
+def buildGuardian():
url='http://www.theguardian.com/us'
name='The Guardian US'
while True:
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir, 'utf8')
+ content=urlToContent(url, 'utf8')
#get main headline
h1=content
@@ -256,20 +255,20 @@ def buildGuardian(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- gdn=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
return gdn
-def buildWashTimes(scratchDir):
+def buildWashTimes():
url='http://www.washingtontimes.com/'
name='Washington Times'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -305,19 +304,19 @@ def buildWashTimes(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- wat=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ wat=buildNewsSource2(name, url, h1s, h2s, h3s)
wat=removeBadStories(wat, None, None, None, None)
return wat
-def buildCSM(scratchDir):
+def buildCSM():
url='http://www.csmonitor.com/USA'
name='Christian Science Monitor'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#this makes sure we don't get '/USA' in the URL twice
url=url.split('/USA')[0]
@@ -368,7 +367,7 @@ def buildCSM(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- csm=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ csm=buildNewsSource2(name, url, h1s, h2s, h3s)
badTitleArr=['Change Agent']
badDescArr=None
@@ -388,7 +387,7 @@ in The Blaze articles by grabbing the first portion of the story instead
def blazeFixDesc(articleArr):
TAG_RE = re.compile(r'<[^>]+>')
for i in range(len(articleArr)):
- desc=urlToContent(articleArr[i].url, scratchDir)
+ desc=urlToContent(articleArr[i].url)
desc=desc.split('<div class="entry-content article-styles">', 1)[1]
desc=desc.split('<p>', 1)[1]
desc=TAG_RE.sub('', desc)
@@ -400,12 +399,12 @@ def blazeFixDesc(articleArr):
-def buildBlaze(scratchDir):
+def buildBlaze():
url='http://theblaze.com'
name='The Blaze'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -439,7 +438,7 @@ def buildBlaze(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- blz=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ blz=buildNewsSource2(name, url, h1s, h2s, h3s)
badTitleArr=['Tucker Carlson', 'Mark Levin']
badDescArr=['Lawrence Jones', 'Mike Slater']
@@ -459,12 +458,12 @@ def buildBlaze(scratchDir):
-def buildCBS(scratchDir):
+def buildCBS():
url='http://cbsnews.com'
name='CBS News'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -508,7 +507,7 @@ def buildCBS(scratchDir):
h3s.append(url+x)
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- cbs=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ cbs=buildNewsSource2(name, url, h1s, h2s, h3s)
cbs=removeBadStories(cbs, ['60 Minutes'], ['60 Minutes'], None, None, ['whats-in-the-news-coverart'])
return cbs
@@ -517,12 +516,12 @@ def buildCBS(scratchDir):
-def buildNBC(scratchDir):
+def buildNBC():
url='http://nbcnews.com'
name='NBC News'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -571,7 +570,7 @@ def buildNBC(scratchDir):
'''
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- nbc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ nbc=buildNewsSource2(name, url, h1s, h2s, h3s)
nbc=removeBadStories(nbc, None, ['First Read'], None, None, None)
@@ -580,12 +579,12 @@ def buildNBC(scratchDir):
-def buildBBC(scratchDir):
+def buildBBC():
url='http://www.bbc.com/news/world/us_and_canada'
name='BBC US & Canada'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -619,7 +618,7 @@ def buildBBC(scratchDir):
h3s.append('http://www.bbc.com'+x)
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- bbc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ bbc=buildNewsSource2(name, url, h1s, h2s, h3s)
badTitleArr=None
badDescArr=None
badAuthorArr=None
@@ -642,12 +641,12 @@ def buildBBC(scratchDir):
-def buildWeeklyStandard(scratchDir):
+def buildWeeklyStandard():
url='http://www.weeklystandard.com'
name='Weekly Standard'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -692,7 +691,7 @@ def buildWeeklyStandard(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- wkl=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ wkl=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
badTitleArr=None
@@ -707,12 +706,12 @@ def buildWeeklyStandard(scratchDir):
-def buildNPR(scratchDir):
+def buildNPR():
url='http://www.npr.org/sections/news/'
name='NPR'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -746,7 +745,7 @@ def buildNPR(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- npr=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ npr=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
badTitleArr=['The Two-Way']
@@ -761,12 +760,12 @@ def buildNPR(scratchDir):
-def buildABC(scratchDir):
+def buildABC():
url='http://www.abcnews.go.com'
name='ABC News'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -800,7 +799,7 @@ def buildABC(scratchDir):
h3s.append(x)
h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
- abc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ abc=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
badTitleArr=None
@@ -815,12 +814,12 @@ def buildABC(scratchDir):
-def buildFoxNews(scratchDir):
+def buildFoxNews():
url='http://foxnews.com'
name='Fox News'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -854,7 +853,7 @@ def buildFoxNews(scratchDir):
h3s = ['http:' + x if x.startswith('//') else x for x in h3s]
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- fox=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ fox=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
badTitleArr=['O'Reilly', 'Fox News', 'Brett Baier', 'Tucker']
@@ -868,12 +867,12 @@ def buildFoxNews(scratchDir):
-def buildNYT(scratchDir):
+def buildNYT():
url='http://www.nytimes.com'
name='New York Times'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
#this will likely need if/else logic
@@ -951,7 +950,7 @@ def buildNYT(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- nyt=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ nyt=buildNewsSource2(name, url, h1s, h2s, h3s)
nyt=removeBadStories(nyt, None, None, None, None, ['https://www.nytimes.com/section/magazine', 'https://www.nytimes.com/newsletters/the-interpreter'])
diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py
index 0181beb..76c80b0 100644
--- a/unbiased/unbiasedFunctions.py
+++ b/unbiased/unbiasedFunctions.py
@@ -5,7 +5,6 @@ import os
import pkgutil
import random
import re
-import subprocess
import time
import urllib.parse
@@ -17,7 +16,7 @@ from unbiased.unbiasedObjects import *
logger = logging.getLogger('unbiased')
#take in a url and delimiters, return twitter card
-def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
+def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
debugging=False
if debugging:
@@ -142,7 +141,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
return None
-def buildOutput(newsSourceArr, webroot, scratch):
+def buildOutput(newsSourceArr, webroot):
#read in the template html file
from jinja2 import Environment, PackageLoader, select_autoescape
env = Environment(
@@ -193,7 +192,7 @@ def buildOutput(newsSourceArr, webroot, scratch):
source=newsSourceArr[h1RandomSources[i]]
randomArticle=random.sample(range(len(source.h1Arr)), 1)[0]
article=source.h1Arr[randomArticle]
- img_name = pullImage(article.img, image_index, webroot, scratch, 350, 200)
+ img_name = pullImage(article.img, image_index, webroot, 350, 200)
image_index += 1
article.img = img_name
top_stories.append(article)
@@ -202,7 +201,7 @@ def buildOutput(newsSourceArr, webroot, scratch):
for i in range(len(h2RandomPairs)):
pair=h2RandomPairs[i]
article=newsSourceArr[pair[0]].h2Arr[pair[1]]
- img_name = pullImage(article.img, image_index, webroot, scratch, 150, 100)
+ img_name = pullImage(article.img, image_index, webroot, 150, 100)
image_index += 1
article.img = img_name
middle_stories.append(article)
@@ -246,7 +245,7 @@ def printOutputHTML(outputHTML, outDir):
with open(os.path.join(outDir, filename), 'wb') as fp:
fp.write(data)
-def pullImage(url, index, webroot, scratch, target_width=350, target_height=200):
+def pullImage(url, index, webroot, target_width=350, target_height=200):
extension = url.split('.')[-1].split('?')[0]
img_name = 'img{}.{}'.format(index, extension)
res = requests.get(url)
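For code that called these helpers directly, the visible API change is just the dropped scratch argument; output paths are now keyed off the webroot alone. A small usage sketch, assuming the package layout implied by the file paths above; the webroot path and the choice of buildGuardian are illustrative only:

    from unbiased.parser import buildGuardian
    from unbiased.unbiasedFunctions import buildOutput, printOutputHTML

    webroot = '/var/www/unbiased'        # example path, not from the commit

    src = buildGuardian()                # was buildGuardian(scratchDir)
    html = buildOutput([src], webroot)   # was buildOutput(sources, webroot, scratch)
    printOutputHTML(html, webroot)       # unchanged: writes the page under webroot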