Fixed NYT, plus other parsing fixes and a minor visual tweak

author: sstvinc2 <sstvinc2@gmail.com> 2017-02-15 23:33:56 -0600
committer: sstvinc2 <sstvinc2@gmail.com> 2017-02-15 23:33:56 -0600
commit: 233eb048a9bc2c4b84e1ae6a47de6b088779ee4e (patch)
tree: 95681c7f50d434f4b8380f17656135324632c6a6
parent: 38483987b2389b92ca06ac1b409f358ecd4fa991 (diff)
4 files changed, 37 insertions, 17 deletions
diff --git a/html_template/unbiased.css b/html_template/unbiased.css
index 86f653a..90c604a 100644
--- a/html_template/unbiased.css
+++ b/html_template/unbiased.css
@@ -69,7 +69,7 @@ a:link, a:visited, a:hover, a:active {
     vertical-align:top;
     text-align:left;
     width:360px;
-    height:322px;
+    height:352px;
     overflow:hidden;
     background:#fff;
     margin:10px;
diff --git a/main.py b/main.py
index 92f96ae..296de05 100644
--- a/main.py
+++ b/main.py
@@ -15,6 +15,10 @@ def main():
 def run():
     sourceList=[]
 
+
+    nyt=buildNYT()
+    sourceList.append(nyt)
+
     blz=buildBlaze()
     sourceList.append(blz)
 
@@ -30,9 +34,6 @@ def run():
     wkl=buildWeeklyStandard()
     sourceList.append(wkl)
 
-    #nyt=buildNYT()
-    #sourceList.append(nyt)
-
     fox=buildFoxNews()
     sourceList.append(fox)
     
diff --git a/parser.py b/parser.py
index 53b3261..ef90eee 100644
--- a/parser.py
+++ b/parser.py
@@ -225,6 +225,8 @@ def buildBlaze():
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
     blz=buildNewsSource2(name, url, h1s, h2s, h3s)
 
+    blz=removeBadStories(blz, None, ['Tomi Lahren'], None)
+
     #The Blaze has dumb, short description fields, so we need to grab
     #the first x characters of actual article text instead
     blz.h1Arr=blazeFixDesc(blz.h1Arr)
@@ -502,10 +504,17 @@ def buildNYT():
     #this will likely need if/else logic
     h1=content
 
-    #This is with a large headline over a and b columns
-    h1=h1.split('story theme-summary banner', 1)[1]
-    h1=h1.split('<a href="', 1)[1]
-    h1=h1.split('"', 1)[0]
+    if 'story theme-summary banner' in h1:
+        #This is with a large headline over a and b columns
+        h1=h1.split('story theme-summary banner', 1)[1]
+        h1=h1.split('<a href="', 1)[1]
+        h1=h1.split('"', 1)[0]
+    else:
+        #otherwise, pull the first story from the A column
+        h1=h1.split('<div class="a-column column">', 1)[1]
+        h1=h1.split('<a href="', 1)[1].split('"', 1)[0]
+    h1s=[h1]
+        
 
     #GET SECONDARY HEADLINES
     #This comes from the a column or b column, above the break
@@ -557,7 +566,7 @@ def buildNYT():
         if (h1 not in x) and (x not in h3s):
             h3s.append(x)
 
-    h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
+    h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
     nyt=buildNewsSource2(name, url, h1s, h2s, h3s)
 
     return nyt
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index 46723cd..57c8c6a 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -34,23 +34,33 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im
         title=title[:-1]
 
         author=''
-        authorTags=['article:author', 'dc.creator']
-        for tag in authorTags:
-            if tag in content:
-                author=content.split(tag+'" content=')[1][1:].split('>')[0]
-                author=author[:-1]
-                break
+        if sourceName!='The Blaze':
+            authorTags=['article:author', 'dc.creator']
+            for tag in authorTags:
+                if tag in content:
+                    author=content.split(tag+'" content=')[1][1:].split('>')[0]
+                    author=author[:-1]
+                    break
+        #handle The Blaze
+        else:
+            if 'class="article-author">' in content:
+                author=content.split('class="article-author">')[1].split('<')[0]
+            elif 'class="article-author" href="' in content:
+                author=content.split('class="article-author" href="')[1]
+                author=author.split('>')[1].split('<')[0].strip()
 
         description=content.split('og:description" content=')[1][1:].split('>')[0]
         if description[-1]=='/':
             description=description[:-1].strip()
         description=description[:-1]
+        #strip out self-references
+        description=description.replace(sourceName, 'our')
 
         a=Article(title, url, img, description, sourceName, author)
         return a
 
     except:
-        print("Article parsing error in buildArticle() for URL: "+url+" in source"+sourceName)
+        print("Article parsing error in buildArticle() for URL: "+url+" in source "+sourceName+'\n')
         return None
 
 
@@ -63,7 +73,7 @@ def buildOutput(newsSourceArr):
     #set the random order for sources
     h1RandomSources=random.sample(range(len(newsSourceArr)), 4)
     #For h2s and h3s, select N random sources (can repeat), then
-    #a non-repetitive random article from within that source
+    #a non-repetitive random article from within the selected source
     h2RandomPairs=[]
     while len(h2RandomPairs) < 6:
         x=random.sample(range(len(newsSourceArr)), 1)[0]
author	sstvinc2 <sstvinc2@gmail.com>	2017-02-15 23:33:56 -0600
committer	sstvinc2 <sstvinc2@gmail.com>	2017-02-15 23:33:56 -0600
commit	233eb048a9bc2c4b84e1ae6a47de6b088779ee4e (patch)
tree	95681c7f50d434f4b8380f17656135324632c6a6
parent	38483987b2389b92ca06ac1b409f358ecd4fa991 (diff)