From 0ce27f6e13a139c2fe06082dfb10a35d213fc7a7 Mon Sep 17 00:00:00 2001 From: ssstvinc2 Date: Fri, 24 Feb 2017 00:08:56 -0500 Subject: minor tweaks, re-enabled NYT and GDN --- parser.py | 55 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 20 deletions(-) (limited to 'parser.py') diff --git a/parser.py b/parser.py index 2c22a87..0426df1 100755 --- a/parser.py +++ b/parser.py @@ -93,7 +93,8 @@ def removeDuplicates(h1s, h2s, h3s): if (h3s[i] in h1and2[k]) or (h1and2[k] in h3s[i]): removeArr.append(h3s[i]) for x in removeArr: - h3s.remove(x) + if x in h3s: + h3s.remove(x) return h1s, h2s, h3s @@ -225,7 +226,7 @@ def buildTheHill(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) hil=buildNewsSource2(name, url, h1s, h2s, h3s) - #hil=removeBadStories(gdn, None, None, None, None) + hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp'], None, None) return hil @@ -234,7 +235,7 @@ def buildTheHill(): def buildGuardian(): - url='http://www.theguardian.com/us-news' + url='http://www.theguardian.com/us' name='The Guardian' #DOWNLOAD HOMEPAGE CONTENT @@ -270,7 +271,7 @@ def buildGuardian(): h3s.append(x) h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - + gdn=buildNewsSource2(name, url, h1s, h2s, h3s) gdn=removeBadStories(gdn, None, ['Tom McCarthy'], ['https://www.theguardian.com/profile/ben-jacobs'], None) @@ -338,7 +339,7 @@ def buildBlaze(): blz=buildNewsSource2(name, url, h1s, h2s, h3s) - blz=removeBadStories(blz, None, ['Lawrence Jones'], ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka'], None) + blz=removeBadStories(blz, None, ['Lawrence Jones'], ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales'], None) #The Blaze has dumb, short description fields, so we need to grab #the first x characters of actual article text instead @@ -400,6 +401,7 @@ def buildCBS(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) cbs=buildNewsSource2(name, url, h1s, h2s, h3s) + cbs=removeBadStories(cbs, ['60 Minutes'], None, None, None, None) return cbs @@ -460,6 +462,8 @@ def buildNBC(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) nbc=buildNewsSource2(name, url, h1s, h2s, h3s) + nbc=removeBadStories(nbc, None, ['First Read'], None, None, None) + return nbc @@ -709,12 +713,12 @@ def buildNYT(): else: #otherwise, pull the first story from the A column h1=h1.split('
', 1)[1] + h1=h1.split('
', 1)[1] - h2=h2.split('', 1)[0] + h3=content + h3=h3.split('