author    Matt Singleton <matt@xcolour.net>  2017-04-19 16:47:30 -0400
committer Matt Singleton <matt@xcolour.net>  2017-04-19 16:47:30 -0400
commit    8dffc67fae2c5a6cc1fe125809e0b74d8b4b28f3 (patch)
tree      f7188ac5c7a4fb2a36f74c52699d03dfe5a783ce
parent    7a8efb94dc2463a6d30afc77f10df78ebfa4c353 (diff)
don't need a scratch directory any more
-rwxr-xr-x  unbiased/main.py               10
-rwxr-xr-x  unbiased/parser.py             91
-rw-r--r--  unbiased/unbiasedFunctions.py  11
3 files changed, 54 insertions, 58 deletions
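This commit strips the scratch-directory plumbing out of the crawler: pages are fetched straight into memory with requests, so urlToContent, buildArticle, buildNewsSource2, the per-source build functions, buildOutput, and pullImage all lose their scratch/scratchDir parameter. A minimal sketch of the fetch helper as it stands after this change; the success path is taken from the parser.py hunk below, while the failure branch and the comment about sourceEncoding are assumptions, not part of the commit:

    import requests

    def urlToContent(url, sourceEncoding='utf8'):
        # Fetch the page straight into memory; no temp file or scratch directory.
        # sourceEncoding is kept for existing call sites; requests decodes the body itself.
        res = requests.get(url)
        if res.status_code == 200:
            return res.text
        # Assumed failure path; the diff below only shows the 200 case.
        raise RuntimeError('could not download ' + url)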
diff --git a/unbiased/main.py b/unbiased/main.py
index c760788..60211ea 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -19,14 +19,13 @@ logger.addHandler(ch)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-w', '--webroot', default='/var/www/ubiased', help='location to write the output html')
- parser.add_argument('-s', '--scratch', default='/opt/unbiased/scratch', help='writable scratch workspace')
args = parser.parse_args()
crawl_frequency = 600
while True:
logger.info('Starting crawl')
start = time.time()
- run(args.webroot, args.scratch)
+ run(args.webroot)
finish = time.time()
runtime = finish - start
sleeptime = crawl_frequency - runtime
@@ -34,7 +33,7 @@ def main():
if sleeptime > 0:
time.sleep(sleeptime)
-def run(webroot, scratch):
+def run(webroot):
sourceList=[]
'''
@@ -47,7 +46,6 @@ def run(webroot, scratch):
'''
logger.debug('Running with webroot="{}"'.format(webroot))
- logger.debug('Running with scratch="{}"'.format(scratch))
### These values have to be the second half of the function name
@@ -65,7 +63,7 @@ def run(webroot, scratch):
possibles = globals().copy()
possibles.update(locals())
method = possibles.get(fn)
- src=method(scratch)
+ src=method()
sourceList.append(src)
break
except Exception as ex:
@@ -79,7 +77,7 @@ def run(webroot, scratch):
newsSourceArr = sourceList
#build the output file HTML
- outputHTML=buildOutput(newsSourceArr, webroot, scratch)
+ outputHTML=buildOutput(newsSourceArr, webroot)
#print the output file HTML
printOutputHTML(outputHTML, webroot)
diff --git a/unbiased/parser.py b/unbiased/parser.py
index 0a8398c..41727f5 100755
--- a/unbiased/parser.py
+++ b/unbiased/parser.py
@@ -3,7 +3,6 @@ import logging
import os
import re
-import subprocess
import urllib.parse
import requests
@@ -18,7 +17,7 @@ logger = logging.getLogger('unbiased')
Takes in a URL, downloads the file to a temp file,
reads the file into a string, and returns that string
'''
-def urlToContent(url, scratchDir, sourceEncoding='utf8'):
+def urlToContent(url, sourceEncoding='utf8'):
res = requests.get(url)
if res.status_code == 200:
return res.text
@@ -31,7 +30,7 @@ Creates a new newsSource2 object.
For each URL in h1-h3URLs, calls the file scraper and appends the new Article object.
Returns a newsSource2 object
'''
-def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
+def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
url_parts = urllib.parse.urlparse(url)
scheme = url_parts.scheme
@@ -40,7 +39,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
h3URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h3URLs]
h1Arr=[]
- a=buildArticle(h1URLs[0], name, scratchDir)
+ a=buildArticle(h1URLs[0], name)
if a==None:
logger.debug('H1 Nonetype in '+name)
else:
@@ -48,7 +47,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
h2Arr=[]
for x in h2URLs:
- a=buildArticle(x, name, scratchDir)
+ a=buildArticle(x, name)
if a!=None:
h2Arr.append(a)
else:
@@ -56,7 +55,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
h3Arr=[]
for x in h3URLs:
- a=buildArticle(x, name, scratchDir)
+ a=buildArticle(x, name)
if a!=None:
h3Arr.append(a)
else:
@@ -161,12 +160,12 @@ def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, b
-def buildTheHill(scratchDir):
+def buildTheHill():
url='http://thehill.com'
name='The Hill'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -198,7 +197,7 @@ def buildTheHill(scratchDir):
h3s.append(url+x)
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- hil=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ hil=buildNewsSource2(name, url, h1s, h2s, h3s)
hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp', 'Juan Williams', 'Judd Gregg'], None, None)
return hil
@@ -207,14 +206,14 @@ def buildTheHill(scratchDir):
-def buildGuardian(scratchDir):
+def buildGuardian():
url='http://www.theguardian.com/us'
name='The Guardian US'
while True:
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir, 'utf8')
+ content=urlToContent(url, 'utf8')
#get main headline
h1=content
@@ -256,20 +255,20 @@ def buildGuardian(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- gdn=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
return gdn
-def buildWashTimes(scratchDir):
+def buildWashTimes():
url='http://www.washingtontimes.com/'
name='Washington Times'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -305,19 +304,19 @@ def buildWashTimes(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- wat=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ wat=buildNewsSource2(name, url, h1s, h2s, h3s)
wat=removeBadStories(wat, None, None, None, None)
return wat
-def buildCSM(scratchDir):
+def buildCSM():
url='http://www.csmonitor.com/USA'
name='Christian Science Monitor'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#this makes sure we don't get '/USA' in the URL twice
url=url.split('/USA')[0]
@@ -368,7 +367,7 @@ def buildCSM(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- csm=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ csm=buildNewsSource2(name, url, h1s, h2s, h3s)
badTitleArr=['Change Agent']
badDescArr=None
@@ -388,7 +387,7 @@ in The Blaze articles by grabbing the first portion of the story instead
def blazeFixDesc(articleArr):
TAG_RE = re.compile(r'<[^>]+>')
for i in range(len(articleArr)):
- desc=urlToContent(articleArr[i].url, scratchDir)
+ desc=urlToContent(articleArr[i].url)
desc=desc.split('<div class="entry-content article-styles">', 1)[1]
desc=desc.split('<p>', 1)[1]
desc=TAG_RE.sub('', desc)
@@ -400,12 +399,12 @@ def blazeFixDesc(articleArr):
-def buildBlaze(scratchDir):
+def buildBlaze():
url='http://theblaze.com'
name='The Blaze'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -439,7 +438,7 @@ def buildBlaze(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- blz=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ blz=buildNewsSource2(name, url, h1s, h2s, h3s)
badTitleArr=['Tucker Carlson', 'Mark Levin']
badDescArr=['Lawrence Jones', 'Mike Slater']
@@ -459,12 +458,12 @@ def buildBlaze(scratchDir):
-def buildCBS(scratchDir):
+def buildCBS():
url='http://cbsnews.com'
name='CBS News'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -508,7 +507,7 @@ def buildCBS(scratchDir):
h3s.append(url+x)
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- cbs=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ cbs=buildNewsSource2(name, url, h1s, h2s, h3s)
cbs=removeBadStories(cbs, ['60 Minutes'], ['60 Minutes'], None, None, ['whats-in-the-news-coverart'])
return cbs
@@ -517,12 +516,12 @@ def buildCBS(scratchDir):
-def buildNBC(scratchDir):
+def buildNBC():
url='http://nbcnews.com'
name='NBC News'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -571,7 +570,7 @@ def buildNBC(scratchDir):
'''
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- nbc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ nbc=buildNewsSource2(name, url, h1s, h2s, h3s)
nbc=removeBadStories(nbc, None, ['First Read'], None, None, None)
@@ -580,12 +579,12 @@ def buildNBC(scratchDir):
-def buildBBC(scratchDir):
+def buildBBC():
url='http://www.bbc.com/news/world/us_and_canada'
name='BBC US & Canada'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -619,7 +618,7 @@ def buildBBC(scratchDir):
h3s.append('http://www.bbc.com'+x)
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- bbc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ bbc=buildNewsSource2(name, url, h1s, h2s, h3s)
badTitleArr=None
badDescArr=None
badAuthorArr=None
@@ -642,12 +641,12 @@ def buildBBC(scratchDir):
-def buildWeeklyStandard(scratchDir):
+def buildWeeklyStandard():
url='http://www.weeklystandard.com'
name='Weekly Standard'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -692,7 +691,7 @@ def buildWeeklyStandard(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- wkl=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ wkl=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
badTitleArr=None
@@ -707,12 +706,12 @@ def buildWeeklyStandard(scratchDir):
-def buildNPR(scratchDir):
+def buildNPR():
url='http://www.npr.org/sections/news/'
name='NPR'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -746,7 +745,7 @@ def buildNPR(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- npr=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ npr=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
badTitleArr=['The Two-Way']
@@ -761,12 +760,12 @@ def buildNPR(scratchDir):
-def buildABC(scratchDir):
+def buildABC():
url='http://www.abcnews.go.com'
name='ABC News'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -800,7 +799,7 @@ def buildABC(scratchDir):
h3s.append(x)
h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
- abc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ abc=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
badTitleArr=None
@@ -815,12 +814,12 @@ def buildABC(scratchDir):
-def buildFoxNews(scratchDir):
+def buildFoxNews():
url='http://foxnews.com'
name='Fox News'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
h1=content
@@ -854,7 +853,7 @@ def buildFoxNews(scratchDir):
h3s = ['http:' + x if x.startswith('//') else x for x in h3s]
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- fox=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ fox=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
badTitleArr=['O'Reilly', 'Fox News', 'Brett Baier', 'Tucker']
@@ -868,12 +867,12 @@ def buildFoxNews(scratchDir):
-def buildNYT(scratchDir):
+def buildNYT():
url='http://www.nytimes.com'
name='New York Times'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, scratchDir)
+ content=urlToContent(url)
#get main headline
#this will likely need if/else logic
@@ -951,7 +950,7 @@ def buildNYT(scratchDir):
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
- nyt=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir)
+ nyt=buildNewsSource2(name, url, h1s, h2s, h3s)
nyt=removeBadStories(nyt, None, None, None, None, ['https://www.nytimes.com/section/magazine', 'https://www.nytimes.com/newsletters/the-interpreter'])
diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py
index 0181beb..76c80b0 100644
--- a/unbiased/unbiasedFunctions.py
+++ b/unbiased/unbiasedFunctions.py
@@ -5,7 +5,6 @@ import os
import pkgutil
import random
import re
-import subprocess
import time
import urllib.parse
@@ -17,7 +16,7 @@ from unbiased.unbiasedObjects import *
logger = logging.getLogger('unbiased')
#take in a url and delimiters, return twitter card
-def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
+def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
debugging=False
if debugging:
@@ -142,7 +141,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
return None
-def buildOutput(newsSourceArr, webroot, scratch):
+def buildOutput(newsSourceArr, webroot):
#read in the template html file
from jinja2 import Environment, PackageLoader, select_autoescape
env = Environment(
@@ -193,7 +192,7 @@ def buildOutput(newsSourceArr, webroot, scratch):
source=newsSourceArr[h1RandomSources[i]]
randomArticle=random.sample(range(len(source.h1Arr)), 1)[0]
article=source.h1Arr[randomArticle]
- img_name = pullImage(article.img, image_index, webroot, scratch, 350, 200)
+ img_name = pullImage(article.img, image_index, webroot, 350, 200)
image_index += 1
article.img = img_name
top_stories.append(article)
@@ -202,7 +201,7 @@ def buildOutput(newsSourceArr, webroot, scratch):
for i in range(len(h2RandomPairs)):
pair=h2RandomPairs[i]
article=newsSourceArr[pair[0]].h2Arr[pair[1]]
- img_name = pullImage(article.img, image_index, webroot, scratch, 150, 100)
+ img_name = pullImage(article.img, image_index, webroot, 150, 100)
image_index += 1
article.img = img_name
middle_stories.append(article)
@@ -246,7 +245,7 @@ def printOutputHTML(outputHTML, outDir):
with open(os.path.join(outDir, filename), 'wb') as fp:
fp.write(data)
-def pullImage(url, index, webroot, scratch, target_width=350, target_height=200):
+def pullImage(url, index, webroot, target_width=350, target_height=200):
extension = url.split('.')[-1].split('?')[0]
img_name = 'img{}.{}'.format(index, extension)
res = requests.get(url)
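For code that called these helpers directly, the visible API change is just the dropped scratch argument; output paths are now keyed off the webroot alone. A small usage sketch, assuming the package layout implied by the file paths above; the webroot path and the choice of buildGuardian are illustrative only:

    from unbiased.parser import buildGuardian
    from unbiased.unbiasedFunctions import buildOutput, printOutputHTML

    webroot = '/var/www/unbiased'        # example path, not from the commit

    src = buildGuardian()                # was buildGuardian(scratchDir)
    html = buildOutput([src], webroot)   # was buildOutput(sources, webroot, scratch)
    printOutputHTML(html, webroot)       # unchanged: writes the page under webroot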