author     sstvinc2 <sstvinc2@gmail.com>   2017-03-06 18:12:59 -0500
committer  sstvinc2 <sstvinc2@gmail.com>   2017-03-06 18:12:59 -0500
commit     f8c6b0084e7d0928121d4c05d3b1f47b10c303c7 (patch)
tree       b82e1fc2034ab454a9e3bb38cb08d0a80dbeec17
parent     0ce27f6e13a139c2fe06082dfb10a35d213fc7a7 (diff)
Added spotCheck ability. Other minor tweaks
-rwxr-xr-x  main.py               7
-rwxr-xr-x  parser.py            24
-rwxr-xr-x  spotCheck.py         38
-rw-r--r--  unbiasedFunctions.py  4
4 files changed, 62 insertions(+), 11 deletions(-)
diff --git a/main.py b/main.py
index 5f9830f..23dcb5f 100755
--- a/main.py
+++ b/main.py
@@ -26,14 +26,15 @@ def run():
hil=buildTheHill()
sourceList.append(hil)
- nyt=buildNYT()
- sourceList.append(nyt)
+ #nyt=buildNYT()
+ #sourceList.append(nyt)
npr=buildNPR()
sourceList.append(npr)
#for some reason, The Guardian sometimes just doesn't work right?
#loop until it gets it right
+ '''
h1='https://www.theguardian.com/us'
looped=False
while h1=='https://www.theguardian.com/us':
@@ -43,6 +44,8 @@ def run():
except:
print('The Guardian: build error. Looping again.')
looped=True
+ '''
+ gdn=buildGuardian()
sourceList.append(gdn)
blz=buildBlaze()
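
The main.py change comments out the open-ended Guardian retry loop in favor of a single buildGuardian() call, presumably alongside the /us to /us-news url fix in parser.py below. If retries are ever needed again, a bounded helper would be cleaner than the removed while loop; a minimal sketch (the helper name and attempt cap are illustrative, not part of this commit):

    def buildWithRetry(buildFn, maxAttempts=3):
        #call the source builder, retrying a few times on failure
        for attempt in range(maxAttempts):
            try:
                return buildFn()
            except Exception:
                print('build error, looping again ({}/{})'.format(attempt+1, maxAttempts))
        raise RuntimeError('builder failed after '+str(maxAttempts)+' attempts')

    #e.g.: gdn=buildWithRetry(buildGuardian)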
diff --git a/parser.py b/parser.py
index 0426df1..be40a3b 100755
--- a/parser.py
+++ b/parser.py
@@ -226,7 +226,7 @@ def buildTheHill():
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
hil=buildNewsSource2(name, url, h1s, h2s, h3s)
- hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp'], None, None)
+ hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp', 'Juan Williams'], None, None)
return hil
@@ -235,7 +235,7 @@ def buildTheHill():
def buildGuardian():
- url='http://www.theguardian.com/us'
+ url='http://www.theguardian.com/us-news'
name='The Guardian'
#DOWNLOAD HOMEPAGE CONTENT
@@ -253,7 +253,7 @@ def buildGuardian():
h2s=[]
#only the h1 and the two h2s have this, so split on it and grab
#the second two
- h2=h2.split('<div class="fc-item__image-container u-responsive-ratio inlined-image">', 3)[2:]
+ h2=h2.split('<div class="fc-item__image-container u-responsive-ratio inlined-image">')[2:]
for x in h2:
x=x.split('<h2 class="fc-item__title"><a href="', 1)[1]
x=x.split('"', 1)[0]
@@ -273,7 +273,7 @@ def buildGuardian():
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
- gdn=removeBadStories(gdn, None, ['Tom McCarthy'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
+ gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
return gdn
@@ -339,7 +339,7 @@ def buildBlaze():
blz=buildNewsSource2(name, url, h1s, h2s, h3s)
- blz=removeBadStories(blz, None, ['Lawrence Jones'], ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales'], None)
+ blz=removeBadStories(blz, None, ['Lawrence Jones', 'Mike Slater'], ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales'], None)
#The Blaze has dumb, short description fields, so we need to grab
#the first x characters of actual article text instead
@@ -401,7 +401,7 @@ def buildCBS():
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
cbs=buildNewsSource2(name, url, h1s, h2s, h3s)
- cbs=removeBadStories(cbs, ['60 Minutes'], None, None, None, None)
+ cbs=removeBadStories(cbs, ['60 Minutes'], None, None, None, ['whats-in-the-news-coverart'])
return cbs
@@ -455,10 +455,12 @@ def buildNBC():
h3s.append(x)
#adjust for today.com urls
+ '''
for arr in [h1s, h2s, h3s]:
for i in range(len(arr)):
if 'today.com' in arr[i]:
arr[i]=arr[i].split('.com', 1)[1]
+ '''
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
nbc=buildNewsSource2(name, url, h1s, h2s, h3s)
@@ -510,7 +512,13 @@ def buildBBC():
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
bbc=buildNewsSource2(name, url, h1s, h2s, h3s)
+ badTitleArr=None
+ badDescArr=None
+ badAuthorArr=None
+ badImgArr=['bbc_news_logo.png']
+ bbc=removeBadStories(bbc, badTitleArr, badDescArr, badAuthorArr, badImgArr)
+
#REMOVE ' - BBC News' from headlines
for i in range(len(bbc.h1Arr)):
if ' - BBC News' in bbc.h1Arr[i].title:
@@ -633,7 +641,7 @@ def buildNPR():
npr=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
- badTitleArr=None
+ badTitleArr=['The Two-Way']
badDescArr=None
badAuthorArr=None
badImgArr=None
@@ -683,7 +691,7 @@ def buildFoxNews():
fox=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
- badTitleArr=['O&#039;Reilly']
+ badTitleArr=['O&#039;Reilly', 'Fox News']
badDescArr=None
badAuthorArr=['Bill O\'Reilly', 'Sean Hannity']
badImgArr=['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
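
Most of the parser.py edits extend the filter lists passed to removeBadStories, whose definition is not part of this diff. Judging from the call sites, each list appears to be matched against one story field; a rough sketch of that filtering, with the field names description, author, and img assumed rather than confirmed by this commit (only .title is visible elsewhere in the diff):

    def dropFlagged(stories, badTitles=None, badDescs=None, badAuthors=None, badImgs=None):
        #keep only stories with no flagged substring in the matching field
        kept=[]
        for s in stories:
            if badTitles and any(t in s.title for t in badTitles):
                continue
            if badDescs and any(d in s.description for d in badDescs):
                continue
            if badAuthors and any(a in s.author for a in badAuthors):
                continue
            if badImgs and any(i in s.img for i in badImgs):
                continue
            kept.append(s)
        return kept

Note the call sites disagree on arity: the Hill and CBS calls pass five filter arguments while the Guardian, Blaze, and new BBC calls pass four, so the real signature likely has a defaulted trailing parameter.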
diff --git a/spotCheck.py b/spotCheck.py
new file mode 100755
index 0000000..5c0e54d
--- /dev/null
+++ b/spotCheck.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+
+
+from parser import *
+from unbiasedObjects import *
+import sys
+
+def spotCheck(src):
+
+ fns = {'hil' : buildTheHill,
+ 'cbs' : buildCBS,
+ 'npr' : buildNPR,
+ 'fox' : buildFoxNews,
+ 'gdn' : buildGuardian,
+ 'blz' : buildBlaze,
+ 'bbc' : buildBBC,
+ 'nbc' : buildNBC}
+
+ data=fns[src]()
+
+ print('H1s:\n--------------')
+ for h in data.h1Arr:
+ print(h.title)
+
+ print('\n\nH2s:\n--------------')
+ for h in data.h2Arr:
+ print(h.title)
+
+ print('\n\nH3s:\n--------------')
+ for h in data.h3Arr:
+ print(h.title)
+
+ print('\n\n')
+
+
+
+if __name__=='__main__':
+ spotCheck(sys.argv[1])
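
The new spotCheck.py is executable (mode 100755) and takes one of the keys from its fns dict as its single argument, so one source's scrape can be checked in isolation. A usage example; the output shape follows from the print calls above, but the actual headlines depend on the live page:

    $ ./spotCheck.py gdn
    H1s:
    --------------
    <Guardian h1 headlines>
    ...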
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index 444428f..4d2019c 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -96,6 +96,8 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im
print("SHOULDN'T GET HERE")
#strip out self-references
+ description=description.replace(sourceName+"'s", 'our')
+ description=description.replace(sourceName+"'", 'our')
description=description.replace(sourceName, 'our')
if debugging:
@@ -141,7 +143,7 @@ def buildOutput(newsSourceArr):
if not pair in h3RandomPairs:
h3RandomPairs.append(pair)
else:
- continue
+ print('\n\n@@@@\nNo H3 stories in '+newsSourceArr[x].name+'\n@@@@\n\n')
#replace html template locations with data from newsSourceArr
for i in range(len(h1RandomSources)):
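
In buildArticle, the two added replace calls handle possessive self-references and must run before the existing bare-name replace; a quick illustration with made-up sample text of why that order matters:

    sourceName='NBC News'
    description="NBC News's crew and NBC News' sources say NBC News is there"
    description=description.replace(sourceName+"'s", 'our')
    description=description.replace(sourceName+"'", 'our')
    description=description.replace(sourceName, 'our')
    #-> "our crew and our sources say our is there"
    #running the bare-name replace first would leave strays like "our's"

The buildOutput change is purely diagnostic: where the loop previously hit a silent continue, it now prints which source contributed no H3 stories.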