diff options
author | sstvinc2 <sstvinc2@gmail.com> | 2017-02-16 10:27:51 -0600 |
---|---|---|
committer | sstvinc2 <sstvinc2@gmail.com> | 2017-02-16 10:27:51 -0600 |
commit | e3d744821919dedcf4f8466c72587008c062acbc (patch) | |
tree | ee75ac6061228a4af2a253d179ccd8c8e71dd3f4 | |
parent | 233eb048a9bc2c4b84e1ae6a47de6b088779ee4e (diff) |
Pulled NYT again; minor fixes for NBC, Blaze
-rw-r--r-- | main.py | 4 | ||||
-rw-r--r-- | parser.py | 9 |
2 files changed, 10 insertions, 3 deletions
@@ -16,8 +16,8 @@ def run(): sourceList=[] - nyt=buildNYT() - sourceList.append(nyt) + #nyt=buildNYT() + #sourceList.append(nyt) blz=buildBlaze() sourceList.append(blz) @@ -225,7 +225,7 @@ def buildBlaze(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) blz=buildNewsSource2(name, url, h1s, h2s, h3s) - blz=removeBadStories(blz, None, ['Tomi Lahren'], None) + blz=removeBadStories(blz, None, ['Tomi Lahren', 'Dana Loesch'], None) #The Blaze has dumb, short description fields, so we need to grab #the first x characters of actual article text instead @@ -323,6 +323,11 @@ def buildNBC(): if h1 not in x: h3s.append(url+x) + #adjust for today.com urls + for arr in [h1s, h2s, h3s]: + for i in range(len(arr)): + if 'today.com' in arr[i]: + arr[i]=arr[i].split('.com', 1)[1] h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) nbc=buildNewsSource2(name, url, h1s, h2s, h3s) @@ -549,6 +554,8 @@ def buildNYT(): if (h1 not in x) and (x not in h2s): h2s.append(x) + print(h2s) + #GET TERTIARY HEADLINES h3=content h3s=[] |