diff options
author     sstvinc2 <sstvinc2@gmail.com>    2017-02-16 21:20:01 -0600
committer  sstvinc2 <sstvinc2@gmail.com>    2017-02-16 21:20:01 -0600
commit     1b08ad4652091d529588f9fb75f7412a07d2dd28 (patch)
tree       691ab2e5f01fc141a4e3dd182e57108db07134ba
parent     53e8b692f6374b72238df797bf14e94f0567b331 (diff)
Some parsing tweaks, mostly for The Guardian
-rw-r--r--  main.py               14
-rw-r--r--  parser.py             21
-rw-r--r--  unbiasedFunctions.py  16
3 files changed, 40 insertions, 11 deletions
diff --git a/main.py b/main.py
@@ -19,7 +19,19 @@ def run():
 	#nyt=buildNYT()
 	#sourceList.append(nyt)
 
-	gdn=buildGuardian()
+	#for some reason, The Guardian sometimes just doesn't work right?
+	#loop until it gets it right
+	h1='https://www.theguardian.com/us'
+	looped=False
+	while h1=='https://www.theguardian.com/us':
+		try:
+			gdn=buildGuardian()
+			h1=gdn.h1Arr[0]
+		except:
+			print('The Guardian: build error. Looping again.')
+		if looped:
+			print('Guardian loop')
+		looped=True
 	sourceList.append(gdn)
 
 	blz=buildBlaze()
diff --git a/parser.py b/parser.py
@@ -180,7 +180,7 @@ def buildGuardian():
 	#get main headline
 	h1=content
-	h1=h1.split('<h1 ', 1)[1]
+	h1=h1.split('<h1', 1)[1]
 	h1=h1.split('<a href="', 1)[1]
 	h1=h1.split('"', 1)[0]
 	h1s=[h1]
@@ -205,13 +205,12 @@ def buildGuardian():
 	while '<h2 class="fc-item__title"><a href="' in h3:
 		h3=h3.split('<h2 class="fc-item__title"><a href="', 1)[1]
 		x=h3.split('"', 1)[0]
-		if h1 not in x:
-			h3s.append(x)
+		h3s.append(x)
 
 	h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
 
 	gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
-	#gdn=removeBadStories(blz, None, None, None)
+	gdn=removeBadStories(gdn, ['Tom McCarthy'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
 
 	return gdn
@@ -359,7 +358,9 @@ def buildNBC():
 	h1=h1.split('panel_hero', 1)[1]
 	h1=h1.split('<a href="', 1)[1]
 	h1=h1.split('"', 1)[0]
-	h1s=[url+h1]
+	if '.com' not in h1:
+		h1=url+h1
+	h1s=[h1]
 
 	#GET SECONDARY HEADLINES
 	h2=content
@@ -371,7 +372,9 @@ def buildNBC():
 		h2=h2.split('<a href="', 1)[1]
 		x=h2.split('"', 1)[0]
 		if h1 not in x:
-			h2s.append(url+x)
+			if '.com' not in x:
+				x=url+x
+			h2s.append(x)
 
 	#GET TERTIARY HEADLINES
 	h3=content
@@ -383,7 +386,9 @@ def buildNBC():
 		h3=h3.split('<a href="', 1)[1]
 		x=h3.split('"', 1)[0]
 		if h1 not in x:
-			h3s.append(url+x)
+			if '.com' not in x:
+				x=url+x
+			h3s.append(x)
 
 	#adjust for today.com urls
 	for arr in [h1s, h2s, h3s]:
@@ -510,7 +515,7 @@ def buildWeeklyStandard():
 	#REMOVE BAD STORIES
 	## if flagged again, remove Micah Mattix
 	badDescArr=['Matt Labash']
-	badAuthorArr=['MATT LABASH', 'TWS PODCAST']
+	badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN']
 	badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png']
 	wkl=removeBadStories(wkl, badDescArr, badAuthorArr, badImgArr)
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index de27228..748aed7 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -25,8 +25,20 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im
 	if sourceName=='The Guardian':
#The Guardian puts an identifying banner on their og:images
#grab the main image from the page instead
- img=content.split('<img class="maxed', 1)[1]
- img=img.split('src="', 1)[1].split('"')[0]
+
+ #scenario 1: regular image
+ if '<img class="maxed' in content:
+ img=content.split('<img class="maxed', 1)[1]
+ img=img.split('src="', 1)[1].split('"')[0]
+ #scenario 2: video in image spot
+ elif '<meta itemprop="image"' in content:
+ img=content.split('<meta itemprop="image"', 1)[1]
+ img=img.split('content="', 1)[1].split('"')[0]
+ #scenario 3: photo essays
+ elif '<img class="immersive-main-media__media"' in content:
+ img=content.split('<img class="immersive-main-media__media"', 1)[1]
+ img=img.split('src="', 1)[1].split('"')[0]
+
else:
img=content.split('og:image" content=')[1][1:].split('>')[0]
if img[-1]=='/':
|