added washington times

author: ssstvinc2 <sstvinc2@gmail.com> 2017-03-23 17:00:05 -0400
committer: ssstvinc2 <sstvinc2@gmail.com> 2017-03-23 17:00:05 -0400
commit: 79b293fdc9da9abe9399c727e08efb1b32fd4337 (patch)
tree: 23660bddfcc6b2d03c91aacb57be47463fdcfb58
parent: 85f03a6d410295e1a59c6a8b579a32d9dbfe50ea (diff)
3 files changed, 59 insertions, 3 deletions
diff --git a/main.py b/main.py
index c54487e..735ff6b 100755
--- a/main.py
+++ b/main.py
@@ -21,14 +21,17 @@ def run():
     SOURCES TO ADD NEXT:
     -ABC
     -REUTERS
+    -Christian Science Monitor
+    -Town Hall
+    -Washington Times
 
     '''
 
 
     ### These values have to be the second half of the function name
     ### E.g. Guardian calls buildGuardian(), etc.
-    sourceFnArr=['Guardian', 'TheHill', 'NPR', 'Blaze', 'BBC', 'NBC', 'CBS',
-                 'FoxNews', ]
+    sourceFnArr=['Guardian', 'TheHill', 'NPR', 'BBC', 'NBC', 'CBS',
+                 'FoxNews', 'WashTimes'] #'Blaze'
     
     for source in sourceFnArr:
         tries=0
diff --git a/parser.py b/parser.py
index 19333e8..942612a 100755
--- a/parser.py
+++ b/parser.py
@@ -248,6 +248,58 @@ def buildGuardian():
     return gdn
 
 
+
+def buildWashTimes():
+    url='http://www.washingtontimes.com/'
+    name='Washington Times'
+
+
+    #DOWNLOAD HOMEPAGE CONTENT
+    content=urlToContent(url)
+    
+    #get main headline
+    h1=content
+    h1=h1.split('top-news', 1)[1]
+    h1=h1.split('<a href="', 1)[1]
+    h1=h1.split('"', 1)[0]
+
+    h1s=[url+h1]
+
+    #GET SECONDARY HEADLINES
+    h2=content
+    h2s=[]
+    #the secondary stories are the unclassed <article> blocks that sit
+    #between the end of the top-news article and the next classed article
+    h2=h2.split('class="top-news', 1)[1]
+    h2=h2.split('</article>', 1)[1] #end of top-news article
+    h2=h2.split('<article ', 1)[0] #note the space; we want unclassed articles
+    h2=h2.split('<article>')[1:]
+    
+    for x in h2:
+        x=x.split('<a href="', 1)[1]
+        x=x.split('"', 1)[0]
+        h2s.append(url+x)
+
+    #GET TERTIARY HEADLINES
+    h3=content
+    h3s=[]
+    h3=h3.split('more-from desktop-only', 1)[1]
+    h3=h3.split('</section>', 1)[0]
+    h3=h3.split('<a href="')[1:]
+    
+    for x in h3:
+        x=x.split('"', 1)[0]
+        h3s.append(url+x)
+
+    h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+
+    wat=buildNewsSource2(name, url, h1s, h2s, h3s)
+    wat=removeBadStories(wat, None, None, None, None)
+
+    return wat
+
+
+
 '''
 Function to fix the oddly short og:descriptions provided
 in The Blaze articles by grabbing the first portion of the story instead
diff --git a/spotCheck.py b/spotCheck.py
index 5c0e54d..7bf46bb 100755
--- a/spotCheck.py
+++ b/spotCheck.py
@@ -14,7 +14,8 @@ def spotCheck(src):
            'gdn' : buildGuardian,
            'blz' : buildBlaze,
            'bbc' : buildBBC,
-           'nbc' : buildNBC}
+           'nbc' : buildNBC,
+           'wat' : buildWashTimes}
 
     data=fns[src]()
author	ssstvinc2 <sstvinc2@gmail.com>	2017-03-23 17:00:05 -0400
committer	ssstvinc2 <sstvinc2@gmail.com>	2017-03-23 17:00:05 -0400
commit	79b293fdc9da9abe9399c727e08efb1b32fd4337 (patch)
tree	23660bddfcc6b2d03c91aacb57be47463fdcfb58
parent	85f03a6d410295e1a59c6a8b579a32d9dbfe50ea (diff)