From a596eb07939b8e653355be0020f0a7d8c0ee2d80 Mon Sep 17 00:00:00 2001 From: ssstvinc2 Date: Thu, 9 Mar 2017 09:38:57 -0500 Subject: Reworked Guardian --- main.py | 18 +++++++++--------- parser.py | 32 +++++++++++++++++++------------- unbiasedFunctions.py | 15 ++++++++++----- 3 files changed, 38 insertions(+), 27 deletions(-) diff --git a/main.py b/main.py index 23dcb5f..182ae26 100755 --- a/main.py +++ b/main.py @@ -23,15 +23,6 @@ def run(): ''' - hil=buildTheHill() - sourceList.append(hil) - - #nyt=buildNYT() - #sourceList.append(nyt) - - npr=buildNPR() - sourceList.append(npr) - #for some reason, The Guardian sometimes just doesn't work right? #loop until it gets it right ''' @@ -48,6 +39,15 @@ def run(): gdn=buildGuardian() sourceList.append(gdn) + hil=buildTheHill() + sourceList.append(hil) + + #nyt=buildNYT() + #sourceList.append(nyt) + + npr=buildNPR() + sourceList.append(npr) + blz=buildBlaze() sourceList.append(blz) diff --git a/parser.py b/parser.py index be40a3b..a54f033 100755 --- a/parser.py +++ b/parser.py @@ -10,12 +10,15 @@ import re Takes in a URL, downloads the file to a temp file, reads the file into a string, and returns that string ''' -def urlToContent(url): +def urlToContent(url, sourceEncoding='utf8'): #download file os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url) #read file - f=open('scratch/temp1.html', 'r')#, encoding="utf8") + if sourceEncoding=='utf8': + f=open('scratch/temp1.html', 'r', encoding="utf8") + else: + f=open('scratch/temp1.html', 'r', encoding="latin-1") content=f.read() f.close() @@ -235,12 +238,12 @@ def buildTheHill(): def buildGuardian(): - url='http://www.theguardian.com/us-news' - name='The Guardian' + url='http://www.theguardian.com/us' + name='The Guardian US' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - + content=urlToContent(url, 'utf8') + #get main headline h1=content h1=h1.split('')[2:] for x in h2: - x=x.split('

', 1)[1] - h3=h3.split('