From f03c0b7c0eb7f607fe271d1e36ec869ee8caca57 Mon Sep 17 00:00:00 2001 From: sstvinc2 Date: Thu, 16 Feb 2017 22:22:36 -0600 Subject: Added NPR --- parser.py | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'parser.py') diff --git a/parser.py b/parser.py index ea318ca..40532f7 100644 --- a/parser.py +++ b/parser.py @@ -508,7 +508,7 @@ def buildWeeklyStandard(): badTitleArr=None ## if flagged again, remove Micah Mattix badDescArr=['Matt Labash'] - badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN'] + badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner'] badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png'] wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr) @@ -517,6 +517,57 @@ def buildWeeklyStandard(): +def buildNPR(): + url='http://npr.com' + name='NPR' + + #DOWNLOAD HOMEPAGE CONTENT + content=urlToContent(url) + + #get main headline + h1=content + h1=h1.split('
', 1)[1] + h1=h1.split('', 1)[1] + h2=h2.split('', 1)[0] + while 'href="' in h2: + h2=h2.split('href="', 1)[1] + x=h2.split('"', 1)[0] + if h1 not in x: + h2s.append(x) + + #GET TERTIARY HEADLINES + h3=content + h3s=[] + h3=h3.split('
    ', 1)[1] + h3=h3.split('
', 1)[0] + while 'href=\'' in h3: + h3=h3.split('href=\'', 1)[1] + x=h3.split('\'', 1)[0] + if h1 not in x: + h3s.append(x) + + h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) + npr=buildNewsSource2(name, url, h1s, h2s, h3s) + + #REMOVE BAD STORIES + badTitleArr=None + badDescArr=None + badAuthorArr=None + badImgArr=None + #npr=removeBadStories(npr, badTitleArr, badDescArr, badAuthorArr, badImgArr) + + return npr + + + + def buildFoxNews(): url='http://foxnews.com' name='Fox News' -- cgit v1.2.3