diff options
author | sstvinc2 <sstvinc2@gmail.com> | 2017-02-15 23:33:56 -0600 |
---|---|---|
committer | sstvinc2 <sstvinc2@gmail.com> | 2017-02-15 23:33:56 -0600 |
commit | 233eb048a9bc2c4b84e1ae6a47de6b088779ee4e (patch) | |
tree | 95681c7f50d434f4b8380f17656135324632c6a6 /parser.py | |
parent | 38483987b2389b92ca06ac1b409f358ecd4fa991 (diff) |
Fixed NYT, plus other parsing fixes and a minor visual tweak
Diffstat (limited to 'parser.py')
-rw-r--r-- | parser.py | 19 |
1 file changed, 14 insertions, 5 deletions
@@ -225,6 +225,8 @@ def buildBlaze(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) blz=buildNewsSource2(name, url, h1s, h2s, h3s) + blz=removeBadStories(blz, None, ['Tomi Lahren'], None) + #The Blaze has dumb, short description fields, so we need to grab #the first x characters of actual article text instead blz.h1Arr=blazeFixDesc(blz.h1Arr) @@ -502,10 +504,17 @@ def buildNYT(): #this will likely need if/else logic h1=content - #This is with a large headline over a and b columns - h1=h1.split('story theme-summary banner', 1)[1] - h1=h1.split('<a href="', 1)[1] - h1=h1.split('"', 1)[0] + if 'story theme-summary banner' in h1: + #This is with a large headline over a and b columns + h1=h1.split('story theme-summary banner', 1)[1] + h1=h1.split('<a href="', 1)[1] + h1=h1.split('"', 1)[0] + else: + #otherwise, pull the first story from the A column + h1=h1.split('<div class="a-column column">', 1)[1] + h1=h1.split('<a href="', 1)[1].split('"', 1)[0] + h1s=[h1] + #GET SECONDARY HEADLINES #This comes from the a column or b column, above the break @@ -557,7 +566,7 @@ def buildNYT(): if (h1 not in x) and (x not in h3s): h3s.append(x) - h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s) + h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) nyt=buildNewsSource2(name, url, h1s, h2s, h3s) return nyt |