diff options
author | ssstvinc2 <sstvinc2@gmail.com> | 2017-03-23 17:00:05 -0400 |
---|---|---|
committer | ssstvinc2 <sstvinc2@gmail.com> | 2017-03-23 17:00:05 -0400 |
commit | 79b293fdc9da9abe9399c727e08efb1b32fd4337 (patch) | |
tree | 23660bddfcc6b2d03c91aacb57be47463fdcfb58 | |
parent | 85f03a6d410295e1a59c6a8b579a32d9dbfe50ea (diff) |
added washington times
-rwxr-xr-x | main.py | 7 | ||||
-rwxr-xr-x | parser.py | 52 | ||||
-rwxr-xr-x | spotCheck.py | 3 |
3 files changed, 59 insertions, 3 deletions
@@ -21,14 +21,17 @@ def run(): SOURCES TO ADD NEXT: -ABC -REUTERS + -Christian Science Monitor + -Town Hall + -Washington Times ''' ### These values have to be the second half of the function name ### E.g. Guardian calls buildGuardian(), etc. - sourceFnArr=['Guardian', 'TheHill', 'NPR', 'Blaze', 'BBC', 'NBC', 'CBS', - 'FoxNews', ] + sourceFnArr=['Guardian', 'TheHill', 'NPR', 'BBC', 'NBC', 'CBS', + 'FoxNews', 'WashTimes'] #'Blaze' for source in sourceFnArr: tries=0 @@ -248,6 +248,58 @@ def buildGuardian(): return gdn
+
+def buildWashTimes():
+ #Scrape the Washington Times homepage and build its news source object.
+ #
+ #The page is parsed with plain string splits rather than an HTML parser:
+ # h1s - link of the first anchor after the 'top-news' marker
+ # h2s - first link of each unclassed <article> following the top-news article
+ # h3s - every link inside the 'more-from desktop-only' section
+ #
+ #Returns whatever removeBadStories() returns for the built source.
+ #
+ #Every split(marker, 1)[1] below raises IndexError when its marker is
+ #missing from the page; nothing in this function catches it.
+ #NOTE(review): every link is built as url+href and url ends with '/'; if the
+ #hrefs are root-relative ('/news/...') the result contains '//' -- confirm
+ #against the live page.
+ url='http://www.washingtontimes.com/'
+ name='Washington Times'
+
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ #presumably returns the page HTML as a str (it is only used with str.split
+ #below) -- urlToContent is defined outside this block
+ content=urlToContent(url)
+
+ #get main headline
+ h1=content
+ h1=h1.split('top-news', 1)[1] #everything after the first 'top-news'
+ h1=h1.split('<a href="', 1)[1] #start of the first href value after it
+ h1=h1.split('"', 1)[0] #cut at the closing quote of that href
+
+ h1s=[url+h1]
+
+ #GET SECONDARY HEADLINES
+ h2=content
+ h2s=[]
+ #the h2 stories are taken from the unclassed <article> elements that sit
+ #between the end of the top-news article and the next <article with
+ #attributes; every one found in that span is kept
+ #NOTE(review): the original comment here said only the h1 and two h2s
+ #match -- confirm how many the page really has
+ h2=h2.split('class="top-news', 1)[1]
+ h2=h2.split('</article>', 1)[1] #end of top-news article
+ h2=h2.split('<article ', 1)[0] #note the space; we want unclassed articles
+ h2=h2.split('<article>')[1:] #drop the text before the first <article>
+
+ for x in h2:
+ #keep only the first link of each article
+ x=x.split('<a href="', 1)[1]
+ x=x.split('"', 1)[0]
+ h2s.append(url+x)
+
+ #GET TERTIARY HEADLINES
+ h3=content
+ h3s=[]
+ h3=h3.split('more-from desktop-only', 1)[1]
+ h3=h3.split('</section>', 1)[0] #stay inside that section
+ h3=h3.split('<a href="')[1:] #one piece per link; drop the leading text
+
+ for x in h3:
+ x=x.split('"', 1)[0] #href value up to its closing quote
+ h3s.append(url+x)
+
+ #helpers below are defined outside this block; presumably removeDuplicates
+ #drops links repeated across the three lists -- verify against its definition
+ h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+
+ wat=buildNewsSource2(name, url, h1s, h2s, h3s)
+ #all four filter arguments are None
+ #NOTE(review): presumably that means no filtering is configured for this
+ #source yet -- confirm against removeBadStories
+ wat=removeBadStories(wat, None, None, None, None)
+
+ return wat
+
+
+
'''
Function to fix the oddly short og:descriptions provided
in The Blaze articles by grabbing the first portion of the story instead
diff --git a/spotCheck.py b/spotCheck.py index 5c0e54d..7bf46bb 100755 --- a/spotCheck.py +++ b/spotCheck.py @@ -14,7 +14,8 @@ def spotCheck(src): 'gdn' : buildGuardian, 'blz' : buildBlaze, 'bbc' : buildBBC, - 'nbc' : buildNBC} + 'nbc' : buildNBC, + 'wat' : buildWashTimes} data=fns[src]() |