Reworked Guardian

author: ssstvinc2 <sstvinc2@gmail.com> 2017-03-09 09:38:57 -0500
committer: ssstvinc2 <sstvinc2@gmail.com> 2017-03-09 09:38:57 -0500
commit: a596eb07939b8e653355be0020f0a7d8c0ee2d80 (patch)
tree: 5c36379ffc2411e3aab56a8a64edd2b8a16c9fef
parent: f8c6b0084e7d0928121d4c05d3b1f47b10c303c7 (diff)
3 files changed, 38 insertions, 27 deletions
diff --git a/main.py b/main.py
index 23dcb5f..182ae26 100755
--- a/main.py
+++ b/main.py
@@ -23,15 +23,6 @@ def run():
 
     '''
 
-    hil=buildTheHill()
-    sourceList.append(hil)
-
-    #nyt=buildNYT()
-    #sourceList.append(nyt)
-
-    npr=buildNPR()
-    sourceList.append(npr)
-
     #for some reason, The Guardian sometimes just doesn't work right?
     #loop until it gets it right
     '''
@@ -48,6 +39,15 @@ def run():
     gdn=buildGuardian()
     sourceList.append(gdn)
 
+    hil=buildTheHill()
+    sourceList.append(hil)
+
+    #nyt=buildNYT()
+    #sourceList.append(nyt)
+
+    npr=buildNPR()
+    sourceList.append(npr)
+
     blz=buildBlaze()
     sourceList.append(blz)
 
diff --git a/parser.py b/parser.py
index be40a3b..a54f033 100755
--- a/parser.py
+++ b/parser.py
@@ -10,12 +10,15 @@ import re
 Takes in a URL, downloads the file to a temp file,
 reads the file into a string, and returns that string
 '''
-def urlToContent(url):
+def urlToContent(url, sourceEncoding='utf8'):
     #download file
     os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url)
     
     #read file
-    f=open('scratch/temp1.html', 'r')#, encoding="utf8")
+    if sourceEncoding=='utf8':
+        f=open('scratch/temp1.html', 'r', encoding="utf8")
+    else:
+        f=open('scratch/temp1.html', 'r', encoding="latin-1")
     content=f.read()
     f.close()
 
@@ -235,12 +238,12 @@ def buildTheHill():
 
 
 def buildGuardian():
-    url='http://www.theguardian.com/us-news'
-    name='The Guardian'
+    url='http://www.theguardian.com/us'
+    name='The Guardian US'
 
     #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url)
-
+    content=urlToContent(url, 'utf8')
+    
     #get main headline
     h1=content
     h1=h1.split('<h1', 1)[1]
@@ -255,15 +258,18 @@ def buildGuardian():
     #the second two
     h2=h2.split('<div class="fc-item__image-container u-responsive-ratio inlined-image">')[2:]
     for x in h2:
-        x=x.split('<h2 class="fc-item__title"><a href="', 1)[1]
-        x=x.split('"', 1)[0]
-        h2s.append(x)
+        if '<h2 class="fc-item__title"><a href="' in x:
+            x=x.split('<h2 class="fc-item__title"><a href="', 1)[1]
+            x=x.split('"', 1)[0]
+            h2s.append(x)
+        else:
+            break
 
     #GET TERTIARY HEADLINES
     h3=content
     h3s=[]
     h3=h3.split('<div class="fc-slice-wrapper">', 1)[1]
-    h3=h3.split('<div class="js-show-more-placeholder">', 1)[0]
+    h3=h3.split('<div class="fc-container__inner">', 1)[0]#'<div class="js-show-more-placeholder">', 1)[0]
     #this story section goes on forever; just grab the first 5
     while '<h2 class="fc-item__title"><a href="' in h3:
         h3=h3.split('<h2 class="fc-item__title"><a href="', 1)[1]
@@ -271,7 +277,7 @@ def buildGuardian():
         h3s.append(x)
 
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-    
+
     gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
     gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
 
@@ -339,7 +345,7 @@ def buildBlaze():
 
 
     blz=buildNewsSource2(name, url, h1s, h2s, h3s)
-    blz=removeBadStories(blz, None, ['Lawrence Jones', 'Mike Slater'], ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales'], None)
+    blz=removeBadStories(blz, ['Tucker Carlson', 'Mark Levin'], ['Lawrence Jones', 'Mike Slater'], ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales'], None)
 
     #The Blaze has dumb, short description fields, so we need to grab
     #the first x characters of actual article text instead
@@ -645,7 +651,7 @@ def buildNPR():
     badDescArr=None
     badAuthorArr=None
     badImgArr=None
-    #npr=removeBadStories(npr, badTitleArr, badDescArr, badAuthorArr, badImgArr)
+    npr=removeBadStories(npr, badTitleArr, badDescArr, badAuthorArr, badImgArr)
 
     return npr
 
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index 4d2019c..950e16d 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -6,7 +6,7 @@ import re
 
 
 #take in a url and delimiters, return twitter card
-def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
+def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
 
     debugging=False
     if debugging:
@@ -124,15 +124,20 @@ def buildOutput(newsSourceArr):
     
     #set the random order for sources
     h1RandomSources=random.sample(range(len(newsSourceArr)), 4)
+    
     #For h2s and h3s, select N random sources (can repeat), then
     #a non-repetitive random article from within 
     h2RandomPairs=[]
     while len(h2RandomPairs) < 6:
         x=random.sample(range(len(newsSourceArr)), 1)[0]
-        y=random.sample(range(len(newsSourceArr[x].h2Arr)), 1)[0]
-        pair=[x,y]
-        if not pair in h2RandomPairs:
-            h2RandomPairs.append(pair)
+        if len(newsSourceArr[x].h2Arr) > 0:
+            y=random.sample(range(len(newsSourceArr[x].h2Arr)), 1)[0]
+            pair=[x,y]
+            if not pair in h2RandomPairs:
+                h2RandomPairs.append(pair)
+        else:
+            print('\n\n@@@@\nNo H2 stories in '+newsSourceArr[x].name+'\n@@@@\n\n')
+
     h3RandomPairs=[]
     while len(h3RandomPairs) < 12:
         x=random.sample(range(len(newsSourceArr)), 1)[0]
author	ssstvinc2 <sstvinc2@gmail.com>	2017-03-09 09:38:57 -0500
committer	ssstvinc2 <sstvinc2@gmail.com>	2017-03-09 09:38:57 -0500
commit	a596eb07939b8e653355be0020f0a7d8c0ee2d80 (patch)
tree	5c36379ffc2411e3aab56a8a64edd2b8a16c9fef
parent	f8c6b0084e7d0928121d4c05d3b1f47b10c303c7 (diff)