diff options
author | sstvinc2 <sstvinc2@gmail.com> | 2017-02-16 16:02:24 -0600 |
---|---|---|
committer | sstvinc2 <sstvinc2@gmail.com> | 2017-02-16 16:02:24 -0600 |
commit | 53e8b692f6374b72238df797bf14e94f0567b331 (patch) | |
tree | aa6bfb1eb1fbe35bd97028d122d30685e591729e | |
parent | 0f2b265ed3c82e674cc714785a9f308be1db9aa1 (diff) |
Added The Guardian to sources
-rw-r--r-- | main.py | 3 | ||||
-rw-r--r-- | parser.py | 48 | ||||
-rw-r--r-- | unbiasedFunctions.py | 17 |
3 files changed, 62 insertions, 6 deletions
@@ -19,6 +19,9 @@ def run():
#nyt=buildNYT()
#sourceList.append(nyt)
+ gdn=buildGuardian()
+ sourceList.append(gdn)
+
blz=buildBlaze()
sourceList.append(blz)
@@ -170,6 +170,52 @@ def removeBadStories(source, badDescArr, badAuthorArr, badImgArr):
return source
+
+def buildGuardian():
+ url='http://www.theguardian.com/us-news'
+ name='The Guardian'
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url)
+
+ #get main headline
+ h1=content
+ h1=h1.split('<h1 ', 1)[1]
+ h1=h1.split('<a href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+ h1s=[h1]
+
+ #GET SECONDARY HEADLINES
+ h2=content
+ h2s=[]
+ #only the h1 and the two h2s have this, so split on it and grab
+ #the second two
+ h2=h2.split('<div class="fc-item__image-container u-responsive-ratio inlined-image">', 3)[2:]
+ for x in h2:
+ x=x.split('<h2 class="fc-item__title"><a href="', 1)[1]
+ x=x.split('"', 1)[0]
+ h2s.append(x)
+
+ #GET TERTIARY HEADLINES
+ h3=content
+ h3s=[]
+ h3=h3.split('<div class="fc-slice-wrapper">', 1)[1]
+ h3=h3.split('<div class="js-show-more-placeholder">', 1)[0]
+ #this story section goes on forever; just grab the first 5
+ while '<h2 class="fc-item__title"><a href="' in h3:
+ h3=h3.split('<h2 class="fc-item__title"><a href="', 1)[1]
+ x=h3.split('"', 1)[0]
+ if h1 not in x:
+ h3s.append(x)
+
+ h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+
+ gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
+ #gdn=removeBadStories(blz, None, None, None)
+
+ return gdn
+
+
'''
Function to fix the oddly short og:descriptions provided in The Blaze articles by grabbing the first portion of the story instead
@@ -464,7 +510,7 @@ def buildWeeklyStandard():
#REMOVE BAD STORIES
## if flagged again, remove Micah Mattix
badDescArr=['Matt Labash']
- badAuthorArr=['MATT LABASH']
+ badAuthorArr=['MATT LABASH', 'TWS PODCAST']
badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png']
wkl=removeBadStories(wkl, badDescArr, badAuthorArr, badImgArr)
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index 57c8c6a..de27228 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -22,11 +22,18 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im
f.close()
try:
- #because the quote separator could be ' or ", trim to just before it then lop it off
- img=content.split('og:image" content=')[1][1:].split('>')[0]
- if img[-1]=='/':
- img=img[:-1].strip()
- img=img[:-1]
+ if sourceName=='The Guardian':
+ #The Guardian puts an identifying banner on their og:images
+ #grab the main image from the page instead
+ img=content.split('<img class="maxed', 1)[1]
+ img=img.split('src="', 1)[1].split('"')[0]
+ else:
+ img=content.split('og:image" content=')[1][1:].split('>')[0]
+ if img[-1]=='/':
+ #because the quote separator could be ' or ",
+ #trim to just before it then lop it off
+ img=img[:-1].strip()
+ img=img[:-1]
title=content.split('og:title" content=')[1][1:].split('>')[0]
if title[-1]=='/':
|