author    sstvinc2 <sstvinc2@gmail.com>    2017-03-09 09:38:57 -0500
committer sstvinc2 <sstvinc2@gmail.com>    2017-03-09 09:38:57 -0500
commit    a596eb07939b8e653355be0020f0a7d8c0ee2d80 (patch)
tree      5c36379ffc2411e3aab56a8a64edd2b8a16c9fef
parent    f8c6b0084e7d0928121d4c05d3b1f47b10c303c7 (diff)
Reworked Guardian
-rwxr-xr-x  main.py              18
-rwxr-xr-x  parser.py            32
-rw-r--r--  unbiasedFunctions.py 15
3 files changed, 38 insertions, 27 deletions
diff --git a/main.py b/main.py
index 23dcb5f..182ae26 100755
--- a/main.py
+++ b/main.py
@@ -23,15 +23,6 @@ def run():
'''
- hil=buildTheHill()
- sourceList.append(hil)
-
- #nyt=buildNYT()
- #sourceList.append(nyt)
-
- npr=buildNPR()
- sourceList.append(npr)
-
#for some reason, The Guardian sometimes just doesn't work right?
#loop until it gets it right
'''
@@ -48,6 +39,15 @@ def run():
gdn=buildGuardian()
sourceList.append(gdn)
+ hil=buildTheHill()
+ sourceList.append(hil)
+
+ #nyt=buildNYT()
+ #sourceList.append(nyt)
+
+ npr=buildNPR()
+ sourceList.append(npr)
+
blz=buildBlaze()
sourceList.append(blz)
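
Note: the commented-out block above records that The Guardian "sometimes just doesn't work right" and suggests looping until it does. A minimal retry sketch of that idea, assuming buildGuardian and sourceList from this repo; the build_with_retry helper itself is hypothetical, not part of the codebase:

    import time

    def build_with_retry(builder, attempts=3, delay=2):
        # Call a flaky source builder up to `attempts` times before giving up.
        for i in range(attempts):
            try:
                return builder()
            except Exception as e:
                print('attempt %d failed: %s' % (i + 1, e))
                time.sleep(delay)
        return None

    gdn = build_with_retry(buildGuardian)
    if gdn is not None:
        sourceList.append(gdn)

The same wrapper would work for any of the other build* helpers.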
diff --git a/parser.py b/parser.py
index be40a3b..a54f033 100755
--- a/parser.py
+++ b/parser.py
@@ -10,12 +10,15 @@ import re
Takes in a URL, downloads the file to a temp file,
reads the file into a string, and returns that string
'''
-def urlToContent(url):
+def urlToContent(url, sourceEncoding='utf8'):
#download file
os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url)
#read file
- f=open('scratch/temp1.html', 'r')#, encoding="utf8")
+ if sourceEncoding=='utf8':
+ f=open('scratch/temp1.html', 'r', encoding="utf8")
+ else:
+ f=open('scratch/temp1.html', 'r', encoding="latin-1")
content=f.read()
f.close()
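
Note: the branch added above only distinguishes utf8 from latin-1, even though sourceEncoding is already a codec name that open() accepts directly. As a hedged alternative sketch (not the repo's code), the encoding can be passed straight through, and the wget temp file avoided by fetching in-process with urllib:

    import urllib.request

    def url_to_content(url, source_encoding='utf8'):
        # Fetch the page in-process and decode with the caller's codec,
        # instead of shelling out to wget and re-reading a temp file.
        with urllib.request.urlopen(url) as resp:
            return resp.read().decode(source_encoding, errors='replace')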
@@ -235,12 +238,12 @@ def buildTheHill():
def buildGuardian():
- url='http://www.theguardian.com/us-news'
- name='The Guardian'
+ url='http://www.theguardian.com/us'
+ name='The Guardian US'
#DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url)
-
+ content=urlToContent(url, 'utf8')
+
#get main headline
h1=content
h1=h1.split('<h1', 1)[1]
@@ -255,15 +258,18 @@ def buildGuardian():
#the second two
h2=h2.split('<div class="fc-item__image-container u-responsive-ratio inlined-image">')[2:]
for x in h2:
- x=x.split('<h2 class="fc-item__title"><a href="', 1)[1]
- x=x.split('"', 1)[0]
- h2s.append(x)
+ if '<h2 class="fc-item__title"><a href="' in x:
+ x=x.split('<h2 class="fc-item__title"><a href="', 1)[1]
+ x=x.split('"', 1)[0]
+ h2s.append(x)
+ else:
+ break
#GET TERTIARY HEADLINES
h3=content
h3s=[]
h3=h3.split('<div class="fc-slice-wrapper">', 1)[1]
- h3=h3.split('<div class="js-show-more-placeholder">', 1)[0]
+ h3=h3.split('<div class="fc-container__inner">', 1)[0]#'<div class="js-show-more-placeholder">', 1)[0]
#this story section goes on forever; just grab the first 5
while '<h2 class="fc-item__title"><a href="' in h3:
h3=h3.split('<h2 class="fc-item__title"><a href="', 1)[1]
@@ -271,7 +277,7 @@ def buildGuardian():
h3s.append(x)
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
+
gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
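
Note: the membership test added in the h2 loop above prevents an IndexError when the fc-item__title marker is missing from a block. A hypothetical helper (illustrative name, not from this repo) capturing that defensive split pattern:

    def split_after(text, marker):
        # Text after the first occurrence of marker, or None when the
        # marker is missing, instead of an IndexError from split()[1].
        if marker in text:
            return text.split(marker, 1)[1]
        return None

Each x=x.split(marker, 1)[1] call could then become tail=split_after(x, marker) with an explicit None check.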
@@ -339,7 +345,7 @@ def buildBlaze():
blz=buildNewsSource2(name, url, h1s, h2s, h3s)
- blz=removeBadStories(blz, None, ['Lawrence Jones', 'Mike Slater'], ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales'], None)
+ blz=removeBadStories(blz, ['Tucker Carlson', 'Mark Levin'], ['Lawrence Jones', 'Mike Slater'], ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales'], None)
#The Blaze has dumb, short description fields, so we need to grab
#the first x characters of actual article text instead
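
Note: a sketch of the grab-the-first-x-characters idea described in the comment above; the helper name and the regex tag stripping are assumptions, and the repo may extract article text differently:

    import re

    def first_chars_as_description(article_html, limit=200):
        # Strip tags, collapse whitespace, and keep the first `limit`
        # characters, cutting back to a word boundary.
        text = re.sub(r'<[^>]+>', ' ', article_html)
        text = ' '.join(text.split())
        if len(text) <= limit:
            return text
        return text[:limit].rsplit(' ', 1)[0] + '...'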
@@ -645,7 +651,7 @@ def buildNPR():
badDescArr=None
badAuthorArr=None
badImgArr=None
- #npr=removeBadStories(npr, badTitleArr, badDescArr, badAuthorArr, badImgArr)
+ npr=removeBadStories(npr, badTitleArr, badDescArr, badAuthorArr, badImgArr)
return npr
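
Note: un-commenting the removeBadStories call activates NPR filtering even though all four arrays are still None here. A loose sketch of what such a filter might do, assuming stories expose title/description/author/img fields and that a None list disables that check; the repo's actual implementation may differ:

    def remove_bad_stories(source, bad_titles, bad_descs, bad_authors, bad_imgs):
        # Drop stories whose field contains any flagged substring;
        # a None list disables that particular check.
        def keep(story):
            for bad, field in ((bad_titles, story.title),
                               (bad_descs, story.description),
                               (bad_authors, story.author),
                               (bad_imgs, story.img)):
                if bad and any(b in (field or '') for b in bad):
                    return False
            return True
        for arr in (source.h1Arr, source.h2Arr, source.h3Arr):
            arr[:] = [s for s in arr if keep(s)]
        return source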
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index 4d2019c..950e16d 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -6,7 +6,7 @@ import re
#take in a url and delimiters, return twitter card
-def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
+def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
debugging=False
if debugging:
@@ -124,15 +124,20 @@ def buildOutput(newsSourceArr):
#set the random order for sources
h1RandomSources=random.sample(range(len(newsSourceArr)), 4)
+
#For h2s and h3s, select N random sources (can repeat), then
#a non-repetitive random article from within
h2RandomPairs=[]
while len(h2RandomPairs) < 6:
x=random.sample(range(len(newsSourceArr)), 1)[0]
- y=random.sample(range(len(newsSourceArr[x].h2Arr)), 1)[0]
- pair=[x,y]
- if not pair in h2RandomPairs:
- h2RandomPairs.append(pair)
+ if len(newsSourceArr[x].h2Arr) > 0:
+ y=random.sample(range(len(newsSourceArr[x].h2Arr)), 1)[0]
+ pair=[x,y]
+ if not pair in h2RandomPairs:
+ h2RandomPairs.append(pair)
+ else:
+ print('\n\n@@@@\nNo H2 stories in '+newsSourceArr[x].name+'\n@@@@\n\n')
+
h3RandomPairs=[]
while len(h3RandomPairs) < 12:
x=random.sample(range(len(newsSourceArr)), 1)[0]
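
Note: the length guard added above skips sources with an empty h2Arr, but the while loop can still spin forever if all sources together hold fewer than 6 h2 stories. A sketch of the same pair selection with a natural bailout, assuming only the h2Arr attribute shown in this diff:

    import random

    def pick_pairs(news_sources, count):
        # Build every (source index, article index) pair with a non-empty
        # pool, then sample without replacement; this cannot loop forever
        # when fewer than `count` articles exist in total.
        pool = [(x, y) for x, src in enumerate(news_sources)
                       for y in range(len(src.h2Arr))]
        return random.sample(pool, min(count, len(pool)))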