 unbiased/main.py              | 19
 unbiased/parser.py            | 25
 unbiased/unbiasedFunctions.py | 34
 unbiased/unbiasedObjects.py   |  6
 4 files changed, 47 insertions(+), 37 deletions(-)
diff --git a/unbiased/main.py b/unbiased/main.py
index 88ceb7e..ea5412d 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -1,12 +1,20 @@
#!/usr/bin/env python3
import argparse
+import logging
import time
from unbiased.unbiasedObjects import *
from unbiased.unbiasedFunctions import *
from unbiased.parser import *
+logger = logging.getLogger('unbiased')
+logger.setLevel(logging.DEBUG)
+ch = logging.StreamHandler()
+ch.setLevel(logging.DEBUG)
+ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
+logger.addHandler(ch)
+
def main():
parser = argparse.ArgumentParser()
@@ -15,9 +23,9 @@ def main():
args = parser.parse_args()
while True:
- print('-----------------------')
+ logger.info('Starting crawl')
run(args.webroot, args.scratch)
- print('-----------------------')
+ logger.info('Crawl complete. Sleeping for 600s')
time.sleep(600)
def run(webroot, scratch):
@@ -32,8 +40,8 @@ def run(webroot, scratch):
'''
- print('running with webroot="{}"'.format(webroot))
- print('running with scratch="{}"'.format(scratch))
+ logger.debug('Running with webroot="{}"'.format(webroot))
+ logger.debug('Running with scratch="{}"'.format(scratch))
### These values have to be the second half of the function name
@@ -53,8 +61,7 @@ def run(webroot, scratch):
sourceList.append(src)
break
except Exception as ex:
- print(ex)
- print('Build error. Looping again: '+source)
+ logger.error('Build error. Looping again. source={} ex={}'.format(source, ex))
tries+=1
time.sleep(tries)
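
The main.py hunk above is the setup the rest of this diff relies on: the named 'unbiased' logger gets its StreamHandler and formatter exactly once, and every other module just asks for the same logger by name. A minimal standalone sketch of that pattern (module layout and messages here are illustrative, not the project's actual code):

    import logging

    # One-time setup, as in main.py: configure the shared 'unbiased' logger.
    logger = logging.getLogger('unbiased')
    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(ch)

    # In any other module (parser.py, unbiasedFunctions.py, ...), no handler
    # work is needed: getLogger() with the same name returns the same object.
    same_logger = logging.getLogger('unbiased')
    same_logger.debug('picked up the handler configured above')
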
diff --git a/unbiased/parser.py b/unbiased/parser.py
index f068ae8..2bba27d 100755
--- a/unbiased/parser.py
+++ b/unbiased/parser.py
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
+import logging
import os
import re
import subprocess
@@ -7,6 +8,8 @@ import subprocess
from unbiased.unbiasedObjects import *
from unbiased.unbiasedFunctions import buildArticle
+logger = logging.getLogger('unbiased')
+
'''
Takes in a URL, downloads the file to a temp file,
@@ -39,7 +42,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
h1Arr=[]
a=buildArticle(h1URLs[0], name, scratchDir)
if a==None:
- print('................\nH1 Nonetype in '+name+'\n................')
+ logger.debug('H1 Nonetype in '+name)
else:
h1Arr.append(a)
@@ -49,7 +52,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
if a!=None:
h2Arr.append(a)
else:
- print('................\nH2 Nonetype in '+name+'\n................')
+ logger.debug('H2 Nonetype in '+name)
h3Arr=[]
@@ -58,7 +61,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
if a!=None:
h3Arr.append(a)
else:
- print('................\nH3 Nonetype in '+name+'\n................')
+ logger.debug('H3 Nonetype in '+name)
#BUILD THE NEWS SOURCE
newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr)
@@ -119,13 +122,11 @@ def removeDuplicates(h1s, h2s, h3s):
def removalNotification(source, title, reason, value):
- print('*************************')
- print('\t\tSTORY REMOVED')
- print('SOURCE: '+source)
- print('TITLE: \t'+title)
- print('REASON: '+reason)
- print('VALUE: \t'+value)
- print('*************************\n\n')
+ logger.debug("""Story removed
+ SOURCE:\t{}
+ TITLE:\t{}
+ REASON:\t{}
+ VALUE:\t{}""".format(source, title, reason, value))
def removeBadStoriesHelper(source, element, badStringList, arr):
@@ -133,7 +134,7 @@ def removeBadStoriesHelper(source, element, badStringList, arr):
for i in range(len(arr)):
for hed in arr[i]:
if hed==None:
- print("////////\nNone type found in removeBadStoriesHelper for "+source.name+"\n/////////")
+ logger.debug("None type found in removeBadStoriesHelper for "+source.name)
break
for item in badStringList:
if item in getattr(hed, element):
@@ -225,7 +226,7 @@ def buildGuardian(scratchDir):
if h1!='https://www.theguardian.com/us':
break
else:
- print('Guardian loop')
+ logger.debug('Guardian loop')
h1s=[h1]
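
parser.py, like the modules below, reuses the single 'unbiased' logger. A common stdlib variant of this layout (not what the diff does, just the usual pattern) is a per-module child logger: its records propagate up to the handler installed on 'unbiased' in main.py and can carry the module's name via %(name)s in the formatter. A small sketch, with hypothetical names:

    import logging

    # 'unbiased.parser' is a child of 'unbiased', so records propagate to the
    # handler configured on 'unbiased'; no extra handler setup is required.
    logger = logging.getLogger('unbiased.parser')

    def note_missing_headline(name):
        # Adding %(name)s to the formatter would show 'unbiased.parser' here.
        logger.debug('H1 Nonetype in %s', name)
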
diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py
index 16ea07d..775346f 100644
--- a/unbiased/unbiasedFunctions.py
+++ b/unbiased/unbiasedFunctions.py
@@ -1,3 +1,4 @@
+import logging
import os
import pkgutil
import random
@@ -9,15 +10,15 @@ from unbiased.unbiasedObjects import *
from PIL import Image
+logger = logging.getLogger('unbiased')
#take in a url and delimiters, return twitter card
def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
debugging=False
if debugging:
- print(sourceName)
- print(url)
- print()
+ logger.debug(sourceName)
+ logger.debug(url)
temp_article = os.path.join(scratchDir, 'temp_article.html')
@@ -60,7 +61,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
img=img[:-1]
if debugging:
- print(img)
+ logger.debug(img)
title=content.split('og:title" content=')[1][1:].split('>')[0]
if title[-1]=='/':
@@ -68,7 +69,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
title=title[:-1]
if debugging:
- print(title)
+ logger.debug(title)
author=''
@@ -90,7 +91,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
break
if debugging:
- print(author)
+ logger.debug(author)
if 'og:description' in content:
@@ -104,7 +105,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
description=re.sub('<[^<]+?>', '', description)
description=description[1:200]
else:
- print("SHOULDN'T GET HERE")
+ logger.debug("SHOULDN'T GET HERE")
#strip out self-references
description=description.replace(sourceName+"'s", '***')
@@ -112,18 +113,16 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
description=description.replace(sourceName, '***')
if debugging:
- print(description)
+ logger.debug(description)
a=Article(title, url, img, description, sourceName, author)
return a
except Exception:
- print('^^^^^^^^^^^^^^^^^^^^^^^^^')
- print('\tARTICLE PARSING ERROR')
- print('SOURCE: '+sourceName)
- print('URL: \t'+url)
- print('^^^^^^^^^^^^^^^^^^^^^^^^^ \n\n')
+ logger.error("""ARTICLE PARSING ERROR
+ SOURCE:\t{}
+ URL:\t{}""".format(sourceName, url))
return None
@@ -144,7 +143,7 @@ def buildOutput(newsSourceArr, webroot):
if x not in h1RandomSources:
h1RandomSources.append(x)
else:
- print('\n\n@@@@\nNo H1 stories in '+newsSourceArr[x].name+'\n@@@@\n\n')
+ logger.debug('No H1 stories in '+newsSourceArr[x].name)
#For h2s and h3s, select N random sources (can repeat), then
#a non-repetitive random article from within
@@ -157,19 +156,18 @@ def buildOutput(newsSourceArr, webroot):
if not pair in h2RandomPairs:
h2RandomPairs.append(pair)
else:
- print('\n\n@@@@\nNo H2 stories in '+newsSourceArr[x].name+'\n@@@@\n\n')
+ logger.debug('No H2 stories in '+newsSourceArr[x].name)
h3RandomPairs=[]
while len(h3RandomPairs) < 12:
x=random.sample(range(len(newsSourceArr)), 1)[0]
- print(newsSourceArr[x].name)
if len(newsSourceArr[x].h3Arr) > 0:
y=random.sample(range(len(newsSourceArr[x].h3Arr)), 1)[0]
pair=[x,y]
if not pair in h3RandomPairs:
h3RandomPairs.append(pair)
else:
- print('\n\n@@@@\nNo H3 stories in '+newsSourceArr[x].name+'\n@@@@\n\n')
+ logger.debug('No H3 stories in '+newsSourceArr[x].name)
# collect articles for each section
image_index = 0
@@ -203,7 +201,7 @@ def buildOutput(newsSourceArr, webroot):
for i in range(len(newsSourceArr)-1):
sourcesStr+=newsSourceArr[i].name+', '
sourcesStr+=newsSourceArr[-1].name
- print('Successfully parsed: '+sourcesStr)
+ logger.info('Successfully parsed: '+sourcesStr)
timestamp=time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime())
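
buildArticle above still gates its logger.debug calls behind the hard-coded `debugging` flag. With logging in place, the level check can be left to the logger itself; a possible follow-up (a sketch under that assumption, not part of this diff) is to drop the flag and use lazy %-style arguments so messages are only formatted when DEBUG is enabled:

    import logging

    logger = logging.getLogger('unbiased')

    def log_article_fields(source_name, url, title):
        # Hypothetical stand-in for buildArticle's debug prints: logging only
        # formats the message if the DEBUG level is actually enabled, so no
        # 'debugging' flag is needed around these calls.
        logger.debug('source=%s url=%s title=%s', source_name, url, title)
        # For diagnostics that are expensive to compute, an explicit guard
        # still helps:
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('title length: %d', len(title))
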
diff --git a/unbiased/unbiasedObjects.py b/unbiased/unbiasedObjects.py
index 3affbe6..9372d3a 100644
--- a/unbiased/unbiasedObjects.py
+++ b/unbiased/unbiasedObjects.py
@@ -1,3 +1,7 @@
+import logging
+
+logger = logging.getLogger('unbiased')
+
class Article():
title=''
url=''
@@ -86,5 +90,5 @@ class NewsSource():
elif level==3:
self.h3Arr.append(article)
else:
- print("Error: invalid level in NewsSource.addArtlce: ", level)
+ logger.error("Invalid level in NewsSource.addArtlce: %s", level)
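
The logger.error call in the last hunk passes `level` as a log argument instead of concatenating it onto the string, since `"..." + level` raises TypeError when level is an int. The %-style parameterized form is the logging module's convention and defers formatting until the record is actually emitted; a minimal illustration with made-up values:

    import logging

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger('unbiased')

    level = 7  # an unexpected headline level, for illustration
    # String concatenation would raise TypeError here; the parameterized form
    # accepts any type and stringifies it when the record is formatted.
    logger.error('Invalid level in NewsSource.addArtlce: %s', level)
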