diff options
author | ssstvinc2 <sstvinc2@gmail.com> | 2017-03-06 18:12:59 -0500 |
---|---|---|
committer | ssstvinc2 <sstvinc2@gmail.com> | 2017-03-06 18:12:59 -0500 |
commit | f8c6b0084e7d0928121d4c05d3b1f47b10c303c7 (patch) | |
tree | b82e1fc2034ab454a9e3bb38cb08d0a80dbeec17 | |
parent | 0ce27f6e13a139c2fe06082dfb10a35d213fc7a7 (diff) |
Added spotCheck ability. Other minor tweaks
-rwxr-xr-x | main.py | 7 | ||||
-rwxr-xr-x | parser.py | 24 | ||||
-rwxr-xr-x | spotCheck.py | 38 | ||||
-rw-r--r-- | unbiasedFunctions.py | 4 |
4 files changed, 62 insertions, 11 deletions
@@ -26,14 +26,15 @@ def run(): hil=buildTheHill() sourceList.append(hil) - nyt=buildNYT() - sourceList.append(nyt) + #nyt=buildNYT() + #sourceList.append(nyt) npr=buildNPR() sourceList.append(npr) #for some reason, The Guardian sometimes just doesn't work right? #loop until it gets it right + ''' h1='https://www.theguardian.com/us' looped=False while h1=='https://www.theguardian.com/us': @@ -43,6 +44,8 @@ def run(): except: print('The Guardian: build error. Looping again.') looped=True + ''' + gdn=buildGuardian() sourceList.append(gdn) blz=buildBlaze() @@ -226,7 +226,7 @@ def buildTheHill(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
hil=buildNewsSource2(name, url, h1s, h2s, h3s)
- hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp'], None, None)
+ hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp', 'Juan Williams'], None, None)
return hil
@@ -235,7 +235,7 @@ def buildTheHill(): def buildGuardian():
- url='http://www.theguardian.com/us'
+ url='http://www.theguardian.com/us-news'
name='The Guardian'
#DOWNLOAD HOMEPAGE CONTENT
@@ -253,7 +253,7 @@ def buildGuardian(): h2s=[]
#only the h1 and the two h2s have this, so split on it and grab
#the second two
- h2=h2.split('<div class="fc-item__image-container u-responsive-ratio inlined-image">', 3)[2:]
+ h2=h2.split('<div class="fc-item__image-container u-responsive-ratio inlined-image">')[2:]
for x in h2:
x=x.split('<h2 class="fc-item__title"><a href="', 1)[1]
x=x.split('"', 1)[0]
@@ -273,7 +273,7 @@ def buildGuardian(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
- gdn=removeBadStories(gdn, None, ['Tom McCarthy'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
+ gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
return gdn
@@ -339,7 +339,7 @@ def buildBlaze(): blz=buildNewsSource2(name, url, h1s, h2s, h3s)
- blz=removeBadStories(blz, None, ['Lawrence Jones'], ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales'], None)
+ blz=removeBadStories(blz, None, ['Lawrence Jones', 'Mike Slater'], ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales'], None)
#The Blaze has dumb, short description fields, so we need to grab
#the first x characters of actual article text instead
@@ -401,7 +401,7 @@ def buildCBS(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
cbs=buildNewsSource2(name, url, h1s, h2s, h3s)
- cbs=removeBadStories(cbs, ['60 Minutes'], None, None, None, None)
+ cbs=removeBadStories(cbs, ['60 Minutes'], None, None, None, ['whats-in-the-news-coverart'])
return cbs
@@ -455,10 +455,12 @@ def buildNBC(): h3s.append(x)
#adjust for today.com urls
+ '''
for arr in [h1s, h2s, h3s]:
for i in range(len(arr)):
if 'today.com' in arr[i]:
arr[i]=arr[i].split('.com', 1)[1]
+ '''
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
nbc=buildNewsSource2(name, url, h1s, h2s, h3s)
@@ -510,7 +512,13 @@ def buildBBC(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
bbc=buildNewsSource2(name, url, h1s, h2s, h3s)
+ badTitleArr=None
+ badDescArr=None
+ badAuthorArr=None
+ badImgArr=['bbc_news_logo.png']
+ bbc=removeBadStories(bbc, badTitleArr, badDescArr, badAuthorArr, badImgArr)
+
#REMOVE ' - BBC News' from headlines
for i in range(len(bbc.h1Arr)):
if ' - BBC News' in bbc.h1Arr[i].title:
@@ -633,7 +641,7 @@ def buildNPR(): npr=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
- badTitleArr=None
+ badTitleArr=['The Two-Way']
badDescArr=None
badAuthorArr=None
badImgArr=None
@@ -683,7 +691,7 @@ def buildFoxNews(): fox=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
- badTitleArr=['O\'Reilly']
+ badTitleArr=['O\'Reilly', 'Fox News']
badDescArr=None
badAuthorArr=['Bill O\'Reilly', 'Sean Hannity']
badImgArr=['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
diff --git a/spotCheck.py b/spotCheck.py new file mode 100755 index 0000000..5c0e54d --- /dev/null +++ b/spotCheck.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 + + +from parser import * +from unbiasedObjects import * +import sys + +def spotCheck(src): + + fns = {'hil' : buildTheHill, + 'cbs' : buildCBS, + 'npr' : buildNPR, + 'fox' : buildFoxNews, + 'gdn' : buildGuardian, + 'blz' : buildBlaze, + 'bbc' : buildBBC, + 'nbc' : buildNBC} + + data=fns[src]() + + print('H1s:\n--------------') + for h in data.h1Arr: + print(h.title) + + print('\n\nH2s:\n--------------') + for h in data.h2Arr: + print(h.title) + + print('\n\nH3s:\n--------------') + for h in data.h3Arr: + print(h.title) + + print('\n\n') + + + +if __name__=='__main__': + spotCheck(sys.argv[1]) diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py index 444428f..4d2019c 100644 --- a/unbiasedFunctions.py +++ b/unbiasedFunctions.py @@ -96,6 +96,8 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im print("SHOULDN'T GET HERE")
#strip out self-references
+ description=description.replace(sourceName+"'s", 'our')
+ description=description.replace(sourceName+"'", 'our')
description=description.replace(sourceName, 'our')
if debugging:
@@ -141,7 +143,7 @@ def buildOutput(newsSourceArr): if not pair in h3RandomPairs:
h3RandomPairs.append(pair)
else:
- continue
+ print('\n\n@@@@\nNo H3 stories in '+newsSourceArr[x].name+'\n@@@@\n\n')
#replace html template locations with data from newsSourceArr
for i in range(len(h1RandomSources)):
|