diff options
author | sstvinc2 <sstvinc2@gmail.com> | 2017-02-16 22:22:36 -0600 |
---|---|---|
committer | sstvinc2 <sstvinc2@gmail.com> | 2017-02-16 22:22:36 -0600 |
commit | f03c0b7c0eb7f607fe271d1e36ec869ee8caca57 (patch) | |
tree | b465bb35f9103b81e30199987953560eb71b7a62 | |
parent | f9d6e23131f7699df9358f22c37cb6e7521dba7a (diff) |
Added NPR
-rw-r--r-- | main.py | 3 | ||||
-rw-r--r-- | parser.py | 53 |
2 files changed, 55 insertions, 1 deletions
# ---------------------------------------------------------------------------
# Patch context recovered from the collapsed diff.
# The hunks below touch functions that are only partially visible in this
# view, so they are recorded as-is rather than rewritten. Original blank
# lines and indentation could not be recovered.
#
# main.py (per the diffstat)  @@ -19,6 +19,9 @@ def run():
#       #nyt=buildNYT()
#       #sourceList.append(nyt)
#     + npr=buildNPR()
#     + sourceList.append(npr)
#     +
#       #for some reason, The Guardian sometimes just doesn't work right?
#       #loop until it gets it right
#       h1='https://www.theguardian.com/us'
#
# parser.py (per the diffstat)  @@ -508,7 +508,7 @@ def buildWeeklyStandard():
#       badTitleArr=None
#       ## if flagged again, remove Micah Mattix
#       badDescArr=['Matt Labash']
#     - badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN']
#     + badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner']
#       badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png']
#       wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr)
#
# parser.py  @@ -517,6 +517,57 @@ def buildWeeklyStandard():
#     adds buildNPR (below), inserted ahead of the existing definition:
#       def buildFoxNews(): url='http://foxnews.com' name='Fox News' ...
# ---------------------------------------------------------------------------


def _textAfter(text, marker):
    """Return the part of ``text`` that follows the first ``marker``.

    Raises:
        IndexError: if ``marker`` does not occur in ``text``. This is the
            same exception type that ``text.split(marker, 1)[1]`` raises,
            but the message names the marker that was not found.
    """
    _, found, tail = text.partition(marker)
    if not found:
        raise IndexError(
            'NPR homepage is missing expected marker: {!r}'.format(marker))
    return tail


def _collectLinks(fragment, quote, exclude):
    """Collect every ``href`` value in ``fragment``, in document order.

    Args:
        fragment: HTML text to scan.
        quote: the quote character that delimits the attribute value
            (``'"'`` or ``"'"``); only ``href=<quote>`` occurrences are seen.
        exclude: links containing this string (substring test) are skipped.

    Returns:
        A list of the href values that do not contain ``exclude``.
    """
    marker = 'href=' + quote
    links = []
    rest = fragment
    while marker in rest:
        # Advance past the attribute opener, then read up to the closing
        # quote. ``rest`` keeps the link text, so scanning resumes from there.
        rest = rest.partition(marker)[2]
        link = rest.partition(quote)[0]
        if exclude not in link:
            links.append(link)
    return links


def buildNPR():
    """Build the NPR news source from the links on its homepage.

    Fetches the homepage with ``urlToContent``, extracts three tiers of
    links by plain string matching on the page markup, de-duplicates them
    with ``removeDuplicates`` and hands them to ``buildNewsSource2``.

    Returns:
        Whatever ``buildNewsSource2(name, url, h1s, h2s, h3s)`` returns.

    Raises:
        IndexError: if any of the expected page markers is absent, for
            example because the homepage layout changed.
    """
    # NOTE(review): NPR's canonical site is npr.org; this presumably relies
    # on npr.com redirecting there -- confirm.
    url = 'http://npr.com'
    name = 'NPR'

    # DOWNLOAD HOMEPAGE CONTENT
    content = urlToContent(url)

    # MAIN HEADLINE: the first double-quoted link after the content wrapper.
    wrapper = _textAfter(content, '<div id="contentWrap">')
    h1 = _textAfter(wrapper, '<a href="').partition('"')[0]
    h1s = [h1]

    # SECONDARY HEADLINES: double-quoted hrefs from the first low-volume
    # article up to the end of its section. Links containing the main
    # headline URL are dropped.
    secondary = _textAfter(
        content, '<article class="hp-item attachment volume-low">')
    secondary = secondary.partition('</section>')[0]
    h2s = _collectLinks(secondary, '"', h1)

    # TERTIARY HEADLINES: the "nib-list" block uses single-quoted hrefs.
    tertiary = _textAfter(content, '<ul id="nib-list">')
    tertiary = tertiary.partition('</ul>')[0]
    h3s = _collectLinks(tertiary, "'", h1)

    h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
    npr = buildNewsSource2(name, url, h1s, h2s, h3s)

    # REMOVE BAD STORIES
    # No NPR-specific filters are defined yet. The placeholders mirror the
    # other builders so that the call below can simply be uncommented.
    badTitleArr = None
    badDescArr = None
    badAuthorArr = None
    badImgArr = None
    #npr=removeBadStories(npr, badTitleArr, badDescArr, badAuthorArr, badImgArr)

    return npr