diff options
author | sstvinc2 <sstvinc2@gmail.com> | 2017-02-18 17:10:13 -0600 |
---|---|---|
committer | sstvinc2 <sstvinc2@gmail.com> | 2017-02-18 17:10:13 -0600 |
commit | f19dd7a3291e2d61d4d76eef5300df522193fa1e (patch) | |
tree | c2f37c5d2b19b65e47ca33f2575726da073472af /parser.py | |
parent | f03c0b7c0eb7f607fe271d1e36ec869ee8caca57 (diff) |
Fixed NPR parsing; put NYT back in; Mobile CSS
Diffstat (limited to 'parser.py')
-rw-r--r-- | parser.py | 46 |
1 files changed, 30 insertions, 16 deletions
@@ -100,7 +100,7 @@ def removeDuplicates(h1s, h2s, h3s):
 
 
 
-def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr):
+def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None):
 
 	arr=[source.h1Arr, source.h2Arr, source.h3Arr]
 
@@ -159,6 +159,19 @@ def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr):
 							arr[1].remove(arr[1][0])
 						print('Removed:\n'+source.name+'\n'+hed.title+' from '+source.name+'\nReason: Image ('+item+')\n')
 
+	if badURLArr!=None:
+		for i in range(len(arr)):
+			for hed in arr[i]:
+				for item in badURLArr:
+					if item in hed.url:
+						arr[i].remove(hed)
+						#if it's in the h1 slot, bump up the
+						# first h2 into the h1 slot
+						if i==0:
+							arr[0].append(arr[1][0])
+							arr[1].remove(arr[1][0])
+						print('Removed:\n'+source.name+'\n'+hed.title+' from '+source.name+'\nReason: URL ('+item+')\n')
+
 	return source
 
 
@@ -508,7 +521,7 @@ def buildWeeklyStandard():
 	badTitleArr=None
 	## if flagged again, remove Micah Mattix
 	badDescArr=['Matt Labash']
-	badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner']
+	badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner', 'MARK HEMINGWAY']
 	badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png']
 	wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr)
 
@@ -518,7 +531,7 @@ def buildWeeklyStandard():
 
 
 def buildNPR():
-	url='http://npr.com'
+	url='http://www.npr.org/sections/news/'
 	name='NPR'
 
 	#DOWNLOAD HOMEPAGE CONTENT
@@ -526,7 +539,7 @@ def buildNPR():
 
 	#get main headline
 	h1=content
-	h1=h1.split('<div id="contentWrap">', 1)[1]
+	h1=h1.split('<a id="mainContent">', 1)[1]
 	h1=h1.split('<a href="', 1)[1]
 	h1=h1.split('"', 1)[0]
 	h1s=[h1]
@@ -534,10 +547,11 @@ def buildNPR():
 	#GET SECONDARY HEADLINES
 	h2=content
 	h2s=[]
-	h2=h2.split('<article class="hp-item attachment volume-low">', 1)[1]
-	h2=h2.split('</section>', 1)[0]
-	while 'href="' in h2:
-		h2=h2.split('href="', 1)[1]
+	h2=h2.split('<article class="item has-image">', 1)[1]
+	h2=h2.split('<!-- END CLASS=\'FEATURED-3-UP\' -->', 1)[0]
+	while '<article class="item has-image">' in h2:
+		h2=h2.split('<article class="item has-image">', 1)[1]
+		h2=h2.split('<a href="', 1)[1]
 		x=h2.split('"', 1)[0]
 		if h1 not in x:
 			h2s.append(x)
@@ -545,15 +559,16 @@ def buildNPR():
 	#GET TERTIARY HEADLINES
 	h3=content
 	h3s=[]
-	h3=h3.split('<ul id="nib-list">', 1)[1]
-	h3=h3.split('</ul>', 1)[0]
-	while 'href=\'' in h3:
-		h3=h3.split('href=\'', 1)[1]
-		x=h3.split('\'', 1)[0]
+	h3=h3.split('<div id="overflow" class="list-overflow"', 1)[1]
+	h3=h3.split('<!-- END ID="OVERFLOW" CLASS="LIST-OVERFLOW"', 1)[0]
+	while '<h2 class="title"><a href="' in h3:
+		h3=h3.split('<h2 class="title"><a href="', 1)[1]
+		x=h3.split('"', 1)[0]
 		if h1 not in x:
 			h3s.append(x)
 
 	h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+
 	npr=buildNewsSource2(name, url, h1s, h2s, h3s)
 
 	#REMOVE BAD STORIES
@@ -611,7 +626,8 @@ def buildFoxNews():
 	badDescArr=None
 	badAuthorArr=['Bill O\'Reilly', 'Sean Hannity']
 	badImgArr=['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
-	fox=removeBadStories(fox, badTitleArr, badDescArr, badAuthorArr, badImgArr)
+	badURLArr=['http://www.foxnews.com/opinion']
+	fox=removeBadStories(fox, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
 
 	return fox
 
@@ -673,8 +689,6 @@ def buildNYT():
 		if (h1 not in x) and (x not in h2s):
 			h2s.append(x)
 
-	print(h2s)
-
 	#GET TERTIARY HEADLINES
 	h3=content
 	h3s=[]