author      sstvinc2 <sstvinc2@gmail.com>    2017-02-14 22:05:39 -0600
committer   sstvinc2 <sstvinc2@gmail.com>    2017-02-14 22:05:39 -0600
commit      511b3ba3f9de0d38e861833d6bcd7160487af111
tree        749aaf50248caf154a7b4bee52a1d742eb92fd31
parent      c0a52698826fba2aeb5c2889f3856f051db1052c
Weekly Standard now uses new parser
-rw-r--r--   main.py     11
-rw-r--r--   parser.py   67
2 files changed, 69 insertions, 9 deletions
@@ -58,15 +58,8 @@ def run():
 		None, None))
-	sourceList.append(NewsSource('Weekly Standard',
-		'http://www.weeklystandard.com/',
-		['<div class="lead-photo">', 'href="'],
-		['<div class="lead-photo">', 'href="'],
-		[],
-		'<div id="region_1"', '<div id="region_2"',
-		'<div class="widget lead-story layout-3col-feature" data-count="2">', '<div id="region_2"',
-		None, None))
-
+	wkl=buildWeeklyStandard()
+	sourceList.append(wkl)
 
 	nyt=buildNYT()
 	sourceList.append(nyt)
@@ -94,6 +94,73 @@ def removeDuplicates(h1s, h2s, h3s):
 
 	return h1s, h2s, h3s
 
+
+def buildWeeklyStandard():
+	url='http://www.weeklystandard.com'
+	name='Weekly Standard'
+
+	#DOWNLOAD HOMEPAGE CONTENT
+	content=urlToContent(url)
+
+	#get main headline
+	h1=content
+	h1=h1.split('<div id="region_1"', 1)[1]
+	h1=h1.split('<div id="region_2"', 1)[0]
+	h1=h1.split('<div class="lead-photo">', 1)[1]
+	h1=h1.split('href="', 1)[1]
+	h1=h1.split('"', 1)[0]
+	h1s=[h1]
+
+	#GET SECONDARY HEADLINES
+	h2=content
+	h2s=[]
+	h2=h2.split('<div class="widget lead-story layout-3col-feature" data-count="2">', 1)[1]
+	h2=h2.split('<div id="region_2"', 1)[0]
+	while '<div class="lead-photo">' in h2:
+		h2=h2.split('<div class="lead-photo">', 1)[1]
+		h2=h2.split('href="', 1)[1]
+		x=h2.split('"', 1)[0]
+		if h1 not in x:
+			h2s.append(x)
+
+	#GET TERTIARY HEADLINES
+	h3=content
+	h3s=[]
+	h3=h3.split('Today\'s Standard', 1)[1]
+	h3=h3.split('<div id="region_3"', 1)[0]
+	while '<div class="lead-photo">' in h3:
+		h3=h3.split('<div class="lead-photo">', 1)[1]
+		h3=h3.split('href="', 1)[1]
+		x=h3.split('"', 1)[0]
+		if h1 not in x:
+			h3s.append(x)
+
+	#Need to add URL prefix to all URLs
+	for i in range(len(h1s)):
+		h1s[i]=url+h1s[i]
+	for i in range(len(h2s)):
+		h2s[i]=url+h2s[i]
+	for i in range(len(h3s)):
+		h3s[i]=url+h3s[i]
+
+
+	h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+	wkl=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+	#REMOVE BAD STORIES
+	#if it's in the h1 slot, bump up the first h2 into the h1 slot
+	for h1 in wkl.h1Arr:
+		if 'Matt Labash' in h1.description:
+			wkl.h1Arr.remove(h1)
+			wkl.h1Arr.append(wkl.h2Arr[0])
+			wkl.h2Arr.remove(wkl.h2Arr[0])
+			print('removed '+h1.title)
+
+	return wkl
+
+
+
+
 def buildFoxNews():
 	url='http://foxnews.com'
 	name='Fox News'
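
For context, the new buildWeeklyStandard() function parses the homepage by slicing the raw HTML between landmark strings with str.split() rather than using an HTML parser. The snippet below is a minimal standalone sketch of that split-based pattern; extract_links() and the sample HTML are illustrative stand-ins, not code from this repository.

# Minimal sketch of the marker-splitting pattern used by buildWeeklyStandard().
# extract_links() and the sample HTML below are illustrative, not repository code.
def extract_links(html, start_marker, end_marker, item_marker='<div class="lead-photo">'):
    # Narrow the page to the region between two landmark strings,
    # then pull the href out of each item block inside that region.
    region = html.split(start_marker, 1)[1].split(end_marker, 1)[0]
    links = []
    while item_marker in region:
        region = region.split(item_marker, 1)[1]
        region = region.split('href="', 1)[1]
        links.append(region.split('"', 1)[0])
    return links

sample = ('<div id="region_1"><div class="lead-photo"><a href="/story-1">A</a></div>'
          '<div class="lead-photo"><a href="/story-2">B</a></div><div id="region_2">')
print(extract_links(sample, '<div id="region_1"', '<div id="region_2"'))
# prints ['/story-1', '/story-2']

Note that split(marker, 1)[1] raises an IndexError as soon as a landmark string disappears from the page, so the hard-coded markers have to track the site's current markup.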