-rwxr-xr-x  unbiased/main.py              | 19
-rwxr-xr-x  unbiased/parser.py            | 25
-rw-r--r--  unbiased/unbiasedFunctions.py | 34
-rw-r--r--  unbiased/unbiasedObjects.py   |  6
4 files changed, 47 insertions, 37 deletions
diff --git a/unbiased/main.py b/unbiased/main.py
index 88ceb7e..ea5412d 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -1,12 +1,20 @@
#!/usr/bin/env python3
import argparse
+import logging
import time
from unbiased.unbiasedObjects import *
from unbiased.unbiasedFunctions import *
from unbiased.parser import *
+logger = logging.getLogger('unbiased')
+logger.setLevel(logging.DEBUG)
+ch = logging.StreamHandler()
+ch.setLevel(logging.DEBUG)
+ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
+logger.addHandler(ch)
+
def main():
parser = argparse.ArgumentParser()
@@ -15,9 +23,9 @@ def main():
args = parser.parse_args()
while True:
- print('-----------------------')
+ logger.info('Starting crawl')
run(args.webroot, args.scratch)
- print('-----------------------')
+ logger.info('Crawl complete. Sleeping for 600s')
time.sleep(600)
def run(webroot, scratch):
@@ -32,8 +40,8 @@ def run(webroot, scratch):
'''
- print('running with webroot="{}"'.format(webroot))
- print('running with scratch="{}"'.format(scratch))
+ logger.debug('Running with webroot="{}"'.format(webroot))
+ logger.debug('Running with scratch="{}"'.format(scratch))
### These values have to be the second half of the function name
@@ -53,8 +61,7 @@ def run(webroot, scratch):
sourceList.append(src)
break
except Exception as ex:
- print(ex)
- print('Build error. Looping again: '+source)
+ logger.error('Build error. Looping again. source={} ex={}'.format(source, ex))
tries+=1
time.sleep(tries)
diff --git a/unbiased/parser.py b/unbiased/parser.py
index f068ae8..2bba27d 100755
--- a/unbiased/parser.py
+++ b/unbiased/parser.py
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
+import logging
import os
import re
import subprocess
@@ -7,6 +8,8 @@ import subprocess
from unbiased.unbiasedObjects import *
from unbiased.unbiasedFunctions import buildArticle
+logger = logging.getLogger('unbiased')
+
'''
Takes in a URL, downloads the file to a temp file,
@@ -39,7 +42,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
h1Arr=[]
a=buildArticle(h1URLs[0], name, scratchDir)
if a==None:
- print('................\nH1 Nonetype in '+name+'\n................')
+ logger.debug('H1 Nonetype in '+name)
else:
h1Arr.append(a)
@@ -49,7 +52,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
if a!=None:
h2Arr.append(a)
else:
- print('................\nH2 Nonetype in '+name+'\n................')
+ logger.debug('H2 Nonetype in '+name)
h3Arr=[]
@@ -58,7 +61,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
if a!=None:
h3Arr.append(a)
else:
- print('................\nH3 Nonetype in '+name+'\n................')
+ logger.debug('H3 Nonetype in '+name)
#BUILD THE NEWS SOURCE
newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr)
@@ -119,13 +122,11 @@ def removeDuplicates(h1s, h2s, h3s):
def removalNotification(source, title, reason, value):
- print('*************************')
- print('\t\tSTORY REMOVED')
- print('SOURCE: '+source)
- print('TITLE: \t'+title)
- print('REASON: '+reason)
- print('VALUE: \t'+value)
- print('*************************\n\n')
+ logger.debug("""Story removed
+ SOURCE:\t{}
+ TITLE:\t{}
+ REASON:\t{}
+ VALUE:\t{}""".format(source, title, reason, value))
def removeBadStoriesHelper(source, element, badStringList, arr):
@@ -133,7 +134,7 @@ def removeBadStoriesHelper(source, element, badStringList, arr):
for i in range(len(arr)):
for hed in arr[i]:
if hed==None:
- print("////////\nNone type found in removeBadStoriesHelper for "+source.name+"\n/////////")
+ logger.debug("None type found in removeBadStoriesHelper for "+source.name)
break
for item in badStringList:
if item in getattr(hed, element):
@@ -225,7 +226,7 @@ def buildGuardian(scratchDir):
if h1!='https://www.theguardian.com/us':
break
else:
- print('Guardian loop')
+ logger.debug('Guardian loop')
h1s=[h1]
diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py
index 16ea07d..775346f 100644
--- a/unbiased/unbiasedFunctions.py
+++ b/unbiased/unbiasedFunctions.py
@@ -1,3 +1,4 @@
+import logging
import os
import pkgutil
import random
@@ -9,15 +10,15 @@ from unbiased.unbiasedObjects import *
from PIL import Image
+logger = logging.getLogger('unbiased')
#take in a url and delimiters, return twitter card
def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
debugging=False
if debugging:
- print(sourceName)
- print(url)
- print()
+ logger.debug(sourceName)
+ logger.debug(url)
temp_article = os.path.join(scratchDir, 'temp_article.html')
@@ -60,7 +61,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
img=img[:-1]
if debugging:
- print(img)
+ logger.debug(img)
title=content.split('og:title" content=')[1][1:].split('>')[0]
if title[-1]=='/':
@@ -68,7 +69,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
title=title[:-1]
if debugging:
- print(title)
+ logger.debug(title)
author=''
@@ -90,7 +91,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
break
if debugging:
- print(author)
+ logger.debug(author)
if 'og:description' in content:
@@ -104,7 +105,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
description=re.sub('<[^<]+?>', '', description)
description=description[1:200]
else:
- print("SHOULDN'T GET HERE")
+ logger.debug("SHOULDN'T GET HERE")
#strip out self-references
description=description.replace(sourceName+"'s", '***')
@@ -112,18 +113,16 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
description=description.replace(sourceName, '***')
if debugging:
- print(description)
+ logger.debug(description)
a=Article(title, url, img, description, sourceName, author)
return a
except Exception:
- print('^^^^^^^^^^^^^^^^^^^^^^^^^')
- print('\tARTICLE PARSING ERROR')
- print('SOURCE: '+sourceName)
- print('URL: \t'+url)
- print('^^^^^^^^^^^^^^^^^^^^^^^^^ \n\n')
+ logger.error("""ARTICLE PARSING ERROR
+ SOURCE:\t{}
+ URL:\t{}""".format(sourceName, url))
return None
@@ -144,7 +143,7 @@ def buildOutput(newsSourceArr, webroot):
if x not in h1RandomSources:
h1RandomSources.append(x)
else:
- print('\n\n@@@@\nNo H1 stories in '+newsSourceArr[x].name+'\n@@@@\n\n')
+ logger.debug('No H1 stories in '+newsSourceArr[x].name)
#For h2s and h3s, select N random sources (can repeat), then
#a non-repetitive random article from within
@@ -157,19 +156,18 @@ def buildOutput(newsSourceArr, webroot):
if not pair in h2RandomPairs:
h2RandomPairs.append(pair)
else:
- print('\n\n@@@@\nNo H2 stories in '+newsSourceArr[x].name+'\n@@@@\n\n')
+ logger.debug('No H2 stories in '+newsSourceArr[x].name)
h3RandomPairs=[]
while len(h3RandomPairs) < 12:
x=random.sample(range(len(newsSourceArr)), 1)[0]
- print(newsSourceArr[x].name)
if len(newsSourceArr[x].h3Arr) > 0:
y=random.sample(range(len(newsSourceArr[x].h3Arr)), 1)[0]
pair=[x,y]
if not pair in h3RandomPairs:
h3RandomPairs.append(pair)
else:
- print('\n\n@@@@\nNo H3 stories in '+newsSourceArr[x].name+'\n@@@@\n\n')
+ logger.debug('No H3 stories in '+newsSourceArr[x].name)
# collect articles for each section
image_index = 0
@@ -203,7 +201,7 @@ def buildOutput(newsSourceArr, webroot):
for i in range(len(newsSourceArr)-1):
sourcesStr+=newsSourceArr[i].name+', '
sourcesStr+=newsSourceArr[-1].name
- print('Successfully parsed: '+sourcesStr)
+ logger.info('Successfully parsed: '+sourcesStr)
timestamp=time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime())
diff --git a/unbiased/unbiasedObjects.py b/unbiased/unbiasedObjects.py
index 3affbe6..9372d3a 100644
--- a/unbiased/unbiasedObjects.py
+++ b/unbiased/unbiasedObjects.py
@@ -1,3 +1,7 @@
+import logging
+
+logger = logging.getLogger('unbiased')
+
class Article():
title=''
url=''
@@ -86,5 +90,5 @@ class NewsSource():
elif level==3:
self.h3Arr.append(article)
else:
- print("Error: invalid level in NewsSource.addArtlce: ", level)
+ logger.error("Invalid level in NewsSource.addArtlce: " + level)