Added The Guardian to sources

author: sstvinc2 <sstvinc2@gmail.com> 2017-02-16 16:02:24 -0600
committer: sstvinc2 <sstvinc2@gmail.com> 2017-02-16 16:02:24 -0600
commit: 53e8b692f6374b72238df797bf14e94f0567b331 (patch)
tree: aa6bfb1eb1fbe35bd97028d122d30685e591729e
parent: 0f2b265ed3c82e674cc714785a9f308be1db9aa1 (diff)
3 files changed, 62 insertions, 6 deletions
diff --git a/main.py b/main.py
index 9120906..3b39a73 100644
--- a/main.py
+++ b/main.py
@@ -19,6 +19,9 @@ def run():
     #nyt=buildNYT()
     #sourceList.append(nyt)
 
+    gdn=buildGuardian()
+    sourceList.append(gdn)
+
     blz=buildBlaze()
     sourceList.append(blz)
 
diff --git a/parser.py b/parser.py
index d12b1c2..e6257da 100644
--- a/parser.py
+++ b/parser.py
@@ -170,6 +170,52 @@ def removeBadStories(source, badDescArr, badAuthorArr, badImgArr):
     return source
 
 
+
+def buildGuardian():
+    url='http://www.theguardian.com/us-news'
+    name='The Guardian'
+
+    #DOWNLOAD HOMEPAGE CONTENT
+    content=urlToContent(url)
+
+    #get main headline
+    h1=content
+    h1=h1.split('<h1 ', 1)[1]
+    h1=h1.split('<a href="', 1)[1]
+    h1=h1.split('"', 1)[0]
+    h1s=[h1]
+
+    #GET SECONDARY HEADLINES
+    h2=content
+    h2s=[]
+    #only the h1 and the two h2s have this, so split on it and grab
+    #the second two
+    h2=h2.split('<div class="fc-item__image-container u-responsive-ratio inlined-image">', 3)[2:]
+    for x in h2:
+        x=x.split('<h2 class="fc-item__title"><a href="', 1)[1]
+        x=x.split('"', 1)[0]
+        h2s.append(x)
+
+    #GET TERTIARY HEADLINES
+    h3=content
+    h3s=[]
+    h3=h3.split('<div class="fc-slice-wrapper">', 1)[1]
+    h3=h3.split('<div class="js-show-more-placeholder">', 1)[0]
+    #this story section goes on forever; NOTE: meant to grab just the first 5, but the loop below applies no cap
+    while '<h2 class="fc-item__title"><a href="' in h3:
+        h3=h3.split('<h2 class="fc-item__title"><a href="', 1)[1]
+        x=h3.split('"', 1)[0]
+        if h1 not in x:
+            h3s.append(x)
+
+    h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+
+    gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
+    #gdn=removeBadStories(blz, None, None, None)
+
+    return gdn
+
+
 '''
 Function to fix the oddly short og:descriptions provided
 in The Blaze articles by grabbing the first portion of the story instead
@@ -464,7 +510,7 @@ def buildWeeklyStandard():
     #REMOVE BAD STORIES
     ## if flagged again, remove Micah Mattix
     badDescArr=['Matt Labash']
-    badAuthorArr=['MATT LABASH']
+    badAuthorArr=['MATT LABASH', 'TWS PODCAST']
     badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png']
     wkl=removeBadStories(wkl, badDescArr, badAuthorArr, badImgArr)
 
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index 57c8c6a..de27228 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -22,11 +22,18 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im
     f.close()
 
     try:
-        #because the quote separator could be ' or ", trim to just before it then lop it off
-        img=content.split('og:image" content=')[1][1:].split('>')[0]
-        if img[-1]=='/':
-            img=img[:-1].strip()
-        img=img[:-1]
+        if sourceName=='The Guardian':
+            #The Guardian puts an identifying banner on their og:images
+            #grab the main image from the page instead
+            img=content.split('<img class="maxed', 1)[1]
+            img=img.split('src="', 1)[1].split('"')[0]
+        else:
+            img=content.split('og:image" content=')[1][1:].split('>')[0]
+            if img[-1]=='/':
+                #self-closing tag: drop the trailing '/' (and any space before it);
+                #the closing quote, which could be ' or ", is lopped off below
+                img=img[:-1].strip()
+            img=img[:-1]
 
         title=content.split('og:title" content=')[1][1:].split('>')[0]
         if title[-1]=='/':
author	sstvinc2 <sstvinc2@gmail.com>	2017-02-16 16:02:24 -0600
committer	sstvinc2 <sstvinc2@gmail.com>	2017-02-16 16:02:24 -0600
commit	53e8b692f6374b72238df797bf14e94f0567b331 (patch)
tree	aa6bfb1eb1fbe35bd97028d122d30685e591729e
parent	0f2b265ed3c82e674cc714785a9f308be1db9aa1 (diff)