summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: sstvinc2 <sstvinc2@gmail.com> 2017-02-16 21:20:01 -0600
committer: sstvinc2 <sstvinc2@gmail.com> 2017-02-16 21:20:01 -0600
commit: 1b08ad4652091d529588f9fb75f7412a07d2dd28 (patch)
tree: 691ab2e5f01fc141a4e3dd182e57108db07134ba
parent: 53e8b692f6374b72238df797bf14e94f0567b331 (diff)
Some parsing tweaks, mostly for The Guardian
-rw-r--r-- main.py 14
-rw-r--r-- parser.py 21
-rw-r--r-- unbiasedFunctions.py 16
3 files changed, 40 insertions, 11 deletions
diff --git a/main.py b/main.py
index 3b39a73..ea1508f 100644
--- a/main.py
+++ b/main.py
@@ -19,7 +19,19 @@ def run():
#nyt=buildNYT()
#sourceList.append(nyt)
- gdn=buildGuardian()
+ #for some reason, The Guardian sometimes just doesn't work right?
+ #loop until it gets it right
+ h1='https://www.theguardian.com/us'
+ looped=False
+ while h1=='https://www.theguardian.com/us':
+ try:
+ gdn=buildGuardian()
+ h1=gdn.h1Arr[0]
+ except:
+ print('The Guardian: build error. Looping again.')
+ if looped:
+ print('Guardian loop')
+ looped=True
sourceList.append(gdn)
blz=buildBlaze()
diff --git a/parser.py b/parser.py
index e6257da..41972cd 100644
--- a/parser.py
+++ b/parser.py
@@ -180,7 +180,7 @@ def buildGuardian():
#get main headline
h1=content
- h1=h1.split('<h1 ', 1)[1]
+ h1=h1.split('<h1', 1)[1]
h1=h1.split('<a href="', 1)[1]
h1=h1.split('"', 1)[0]
h1s=[h1]
@@ -205,13 +205,12 @@ def buildGuardian():
while '<h2 class="fc-item__title"><a href="' in h3:
h3=h3.split('<h2 class="fc-item__title"><a href="', 1)[1]
x=h3.split('"', 1)[0]
- if h1 not in x:
- h3s.append(x)
+ h3s.append(x)
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
- #gdn=removeBadStories(blz, None, None, None)
+ gdn=removeBadStories(gdn, ['Tom McCarthy'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
return gdn
@@ -359,7 +358,9 @@ def buildNBC():
h1=h1.split('panel_hero', 1)[1]
h1=h1.split('<a href="', 1)[1]
h1=h1.split('"', 1)[0]
- h1s=[url+h1]
+ if '.com' not in h1:
+ h1=url+h1
+ h1s=[h1]
#GET SECONDARY HEADLINES
h2=content
@@ -371,7 +372,9 @@ def buildNBC():
h2=h2.split('<a href="', 1)[1]
x=h2.split('"', 1)[0]
if h1 not in x:
- h2s.append(url+x)
+ if '.com' not in x:
+ x=url+x
+ h2s.append(x)
#GET TERTIARY HEADLINES
h3=content
@@ -383,7 +386,9 @@ def buildNBC():
h3=h3.split('<a href="', 1)[1]
x=h3.split('"', 1)[0]
if h1 not in x:
- h3s.append(url+x)
+ if '.com' not in x:
+ x=url+x
+ h3s.append(x)
#adjust for today.com urls
for arr in [h1s, h2s, h3s]:
@@ -510,7 +515,7 @@ def buildWeeklyStandard():
#REMOVE BAD STORIES
## if flagged again, remove Micah Mattix
badDescArr=['Matt Labash']
- badAuthorArr=['MATT LABASH', 'TWS PODCAST']
+ badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN']
badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png']
wkl=removeBadStories(wkl, badDescArr, badAuthorArr, badImgArr)
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index de27228..748aed7 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -25,8 +25,20 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im
if sourceName=='The Guardian':
#The Guardian puts an identifying banner on their og:images
#grab the main image from the page instead
- img=content.split('<img class="maxed', 1)[1]
- img=img.split('src="', 1)[1].split('"')[0]
+
+ #scenario 1: regular image
+ if '<img class="maxed' in content:
+ img=content.split('<img class="maxed', 1)[1]
+ img=img.split('src="', 1)[1].split('"')[0]
+ #scenario 2: video in image spot
+ elif '<meta itemprop="image"' in content:
+ img=content.split('<meta itemprop="image"', 1)[1]
+ img=img.split('content="', 1)[1].split('"')[0]
+ #scenario 3: photo essays
+ elif '<img class="immersive-main-media__media"' in content:
+ img=content.split('<img class="immersive-main-media__media"', 1)[1]
+ img=img.split('src="', 1)[1].split('"')[0]
+
else:
img=content.split('og:image" content=')[1][1:].split('>')[0]
if img[-1]=='/':