Added NPR

author: sstvinc2 <sstvinc2@gmail.com> 2017-02-16 22:22:36 -0600
committer: sstvinc2 <sstvinc2@gmail.com> 2017-02-16 22:22:36 -0600
commit: f03c0b7c0eb7f607fe271d1e36ec869ee8caca57 (patch)
tree: b465bb35f9103b81e30199987953560eb71b7a62
parent: f9d6e23131f7699df9358f22c37cb6e7521dba7a (diff)
2 files changed, 55 insertions, 1 deletions
diff --git a/main.py b/main.py
index ea1508f..5025fdc 100644
--- a/main.py
+++ b/main.py
@@ -19,6 +19,9 @@ def run():
     #nyt=buildNYT()
     #sourceList.append(nyt)
 
+    npr=buildNPR()
+    sourceList.append(npr)
+
     #for some reason, The Guardian sometimes just doesn't work right?
     #loop until it gets it right
     h1='https://www.theguardian.com/us'
diff --git a/parser.py b/parser.py
index ea318ca..40532f7 100644
--- a/parser.py
+++ b/parser.py
@@ -508,7 +508,7 @@ def buildWeeklyStandard():
     badTitleArr=None
     ## if flagged again, remove Micah Mattix
     badDescArr=['Matt Labash']
-    badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN']
+    badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner']
     badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png']
     wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr)
 
@@ -517,6 +517,57 @@ def buildWeeklyStandard():
 
 
 
+def buildNPR():
+    """Scrape story links off the NPR homepage and return what buildNewsSource2 builds from them."""
+    url='http://npr.com'
+    name='NPR'
+    #DOWNLOAD HOMEPAGE CONTENT (every split(...)[1] below raises IndexError if its marker is absent)
+    content=urlToContent(url)
+    
+    #GET MAIN HEADLINE: target of the first <a href="..."> after the contentWrap div
+    h1=content
+    h1=h1.split('<div id="contentWrap">', 1)[1]
+    h1=h1.split('<a href="', 1)[1]
+    h1=h1.split('"', 1)[0]
+    h1s=[h1]
+
+    #GET SECONDARY HEADLINES: every href="..." from the first volume-low article up to the next </section>
+    h2=content
+    h2s=[]
+    h2=h2.split('<article class="hp-item attachment volume-low">', 1)[1]
+    h2=h2.split('</section>', 1)[0]
+    while 'href="' in h2:
+        h2=h2.split('href="', 1)[1]
+        x=h2.split('"', 1)[0]
+        if h1 not in x:
+            h2s.append(x)
+
+    #GET TERTIARY HEADLINES: every single-quoted href='...' inside the nib-list <ul>
+    h3=content
+    h3s=[]
+    h3=h3.split('<ul id="nib-list">', 1)[1]
+    h3=h3.split('</ul>', 1)[0]
+    while 'href=\'' in h3:
+        h3=h3.split('href=\'', 1)[1]
+        x=h3.split('\'', 1)[0]
+        if h1 not in x:
+            h3s.append(x)
+
+    h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+    npr=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+    #REMOVE BAD STORIES (currently a no-op: every filter is None and the call below is commented out)
+    badTitleArr=None
+    badDescArr=None
+    badAuthorArr=None
+    badImgArr=None
+    #npr=removeBadStories(npr, badTitleArr, badDescArr, badAuthorArr, badImgArr)
+
+    return npr
+
+
+
+
 def buildFoxNews():
     url='http://foxnews.com'
     name='Fox News'
author	sstvinc2 <sstvinc2@gmail.com>	2017-02-16 22:22:36 -0600
committer	sstvinc2 <sstvinc2@gmail.com>	2017-02-16 22:22:36 -0600
commit	f03c0b7c0eb7f607fe271d1e36ec869ee8caca57 (patch)
tree	b465bb35f9103b81e30199987953560eb71b7a62
parent	f9d6e23131f7699df9358f22c37cb6e7521dba7a (diff)