From f8c6b0084e7d0928121d4c05d3b1f47b10c303c7 Mon Sep 17 00:00:00 2001 From: ssstvinc2 Date: Mon, 6 Mar 2017 18:12:59 -0500 Subject: Added spotCheck ability. Other minor tweaks --- main.py | 7 +++++-- parser.py | 24 ++++++++++++++++-------- spotCheck.py | 38 ++++++++++++++++++++++++++++++++++++++ unbiasedFunctions.py | 4 +++- 4 files changed, 62 insertions(+), 11 deletions(-) create mode 100755 spotCheck.py diff --git a/main.py b/main.py index 5f9830f..23dcb5f 100755 --- a/main.py +++ b/main.py @@ -26,14 +26,15 @@ def run(): hil=buildTheHill() sourceList.append(hil) - nyt=buildNYT() - sourceList.append(nyt) + #nyt=buildNYT() + #sourceList.append(nyt) npr=buildNPR() sourceList.append(npr) #for some reason, The Guardian sometimes just doesn't work right? #loop until it gets it right + ''' h1='https://www.theguardian.com/us' looped=False while h1=='https://www.theguardian.com/us': @@ -43,6 +44,8 @@ def run(): except: print('The Guardian: build error. Looping again.') looped=True + ''' + gdn=buildGuardian() sourceList.append(gdn) blz=buildBlaze() diff --git a/parser.py b/parser.py index 0426df1..be40a3b 100755 --- a/parser.py +++ b/parser.py @@ -226,7 +226,7 @@ def buildTheHill(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) hil=buildNewsSource2(name, url, h1s, h2s, h3s) - hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp'], None, None) + hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp', 'Juan Williams'], None, None) return hil @@ -235,7 +235,7 @@ def buildTheHill(): def buildGuardian(): - url='http://www.theguardian.com/us' + url='http://www.theguardian.com/us-news' name='The Guardian' #DOWNLOAD HOMEPAGE CONTENT @@ -253,7 +253,7 @@ def buildGuardian(): h2s=[] #only the h1 and the two h2s have this, so split on it and grab #the second two - h2=h2.split('
', 3)[2:] + h2=h2.split('
')[2:] for x in h2: x=x.split('