author      sstvinc2 <sstvinc2@gmail.com>    2017-02-16 22:22:36 -0600
committer   sstvinc2 <sstvinc2@gmail.com>    2017-02-16 22:22:36 -0600
commit      f03c0b7c0eb7f607fe271d1e36ec869ee8caca57 (patch)
tree        b465bb35f9103b81e30199987953560eb71b7a62
parent      f9d6e23131f7699df9358f22c37cb6e7521dba7a (diff)
Added NPR
-rw-r--r--    main.py      3
-rw-r--r--    parser.py    53
2 files changed, 55 insertions, 1 deletions
diff --git a/main.py b/main.py
index ea1508f..5025fdc 100644
--- a/main.py
+++ b/main.py
@@ -19,6 +19,9 @@ def run():
    #nyt=buildNYT()
    #sourceList.append(nyt)
+   npr=buildNPR()
+   sourceList.append(npr)
+
    #for some reason, The Guardian sometimes just doesn't work right?
    #loop until it gets it right
    h1='https://www.theguardian.com/us'
diff --git a/parser.py b/parser.py
index ea318ca..40532f7 100644
--- a/parser.py
+++ b/parser.py
@@ -508,7 +508,7 @@ def buildWeeklyStandard():
    badTitleArr=None
    ## if flagged again, remove Micah Mattix
    badDescArr=['Matt Labash']
-   badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN']
+   badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner']
    badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png']
    wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr)
@@ -517,6 +517,57 @@ def buildWeeklyStandard():
+def buildNPR():
+   url='http://npr.com'
+   name='NPR'
+
+   #DOWNLOAD HOMEPAGE CONTENT
+   content=urlToContent(url)
+
+   #get main headline
+   h1=content
+   h1=h1.split('<div id="contentWrap">', 1)[1]
+   h1=h1.split('<a href="', 1)[1]
+   h1=h1.split('"', 1)[0]
+   h1s=[h1]
+
+   #GET SECONDARY HEADLINES
+   h2=content
+   h2s=[]
+   h2=h2.split('<article class="hp-item attachment volume-low">', 1)[1]
+   h2=h2.split('</section>', 1)[0]
+   while 'href="' in h2:
+       h2=h2.split('href="', 1)[1]
+       x=h2.split('"', 1)[0]
+       if h1 not in x:
+           h2s.append(x)
+
+   #GET TERTIARY HEADLINES
+   h3=content
+   h3s=[]
+   h3=h3.split('<ul id="nib-list">', 1)[1]
+   h3=h3.split('</ul>', 1)[0]
+   while 'href=\'' in h3:
+       h3=h3.split('href=\'', 1)[1]
+       x=h3.split('\'', 1)[0]
+       if h1 not in x:
+           h3s.append(x)
+
+   h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+   npr=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+   #REMOVE BAD STORIES
+   badTitleArr=None
+   badDescArr=None
+   badAuthorArr=None
+   badImgArr=None
+   #npr=removeBadStories(npr, badTitleArr, badDescArr, badAuthorArr, badImgArr)
+
+   return npr
+
+
+
+
def buildFoxNews():
    url='http://foxnews.com'
    name='Fox News'
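
The new buildNPR() pulls headline URLs out of the raw homepage HTML with chained str.split() calls rather than an HTML parser. Below is a minimal, self-contained sketch of that pattern; the sample markup, URLs, and the extract_links() helper are illustrative assumptions, not code or markup from the repository or from NPR's site.

def extract_links(section_html, quote='"'):
    #Collect every href value in a chunk of markup, in document order (hypothetical helper)
    links = []
    rest = section_html
    while 'href=' + quote in rest:
        rest = rest.split('href=' + quote, 1)[1]  #jump past the next href=
        links.append(rest.split(quote, 1)[0])     #keep everything up to the closing quote
    return links

#made-up stand-in for the downloaded homepage content
sample = (
    '<div id="contentWrap"><a href="https://example.org/top-story">Top</a></div>'
    '<article class="hp-item"><a href="https://example.org/second">2nd</a>'
    '<a href="https://example.org/third">3rd</a></article>'
)

#main headline: first link after the wrapper div, the same slicing buildNPR() does for h1
h1 = sample.split('<div id="contentWrap">', 1)[1].split('<a href="', 1)[1].split('"', 1)[0]

#secondary headlines: every link inside the article block, skipping the main headline
h2s = [u for u in extract_links(sample.split('<article class="hp-item">', 1)[1]) if h1 not in u]

print(h1)   #https://example.org/top-story
print(h2s)  #['https://example.org/second', 'https://example.org/third']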