From e3d744821919dedcf4f8466c72587008c062acbc Mon Sep 17 00:00:00 2001 From: sstvinc2 Date: Thu, 16 Feb 2017 10:27:51 -0600 Subject: Pulled NYT again; minor fixes for NBC, Blaze --- main.py | 4 ++-- parser.py | 9 ++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index 296de05..9120906 100644 --- a/main.py +++ b/main.py @@ -16,8 +16,8 @@ def run(): sourceList=[] - nyt=buildNYT() - sourceList.append(nyt) + #nyt=buildNYT() + #sourceList.append(nyt) blz=buildBlaze() sourceList.append(blz) diff --git a/parser.py b/parser.py index ef90eee..31c09da 100644 --- a/parser.py +++ b/parser.py @@ -225,7 +225,7 @@ def buildBlaze(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) blz=buildNewsSource2(name, url, h1s, h2s, h3s) - blz=removeBadStories(blz, None, ['Tomi Lahren'], None) + blz=removeBadStories(blz, None, ['Tomi Lahren', 'Dana Loesch'], None) #The Blaze has dumb, short description fields, so we need to grab #the first x characters of actual article text instead @@ -323,6 +323,11 @@ def buildNBC(): if h1 not in x: h3s.append(url+x) + #adjust for today.com urls + for arr in [h1s, h2s, h3s]: + for i in range(len(arr)): + if 'today.com' in arr[i]: + arr[i]=arr[i].split('.com', 1)[1] h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) nbc=buildNewsSource2(name, url, h1s, h2s, h3s) @@ -549,6 +554,8 @@ def buildNYT(): if (h1 not in x) and (x not in h2s): h2s.append(x) + print(h2s) + #GET TERTIARY HEADLINES h3=content h3s=[] -- cgit v1.2.3