diff options
author | sstvinc2 <sstvinc2@gmail.com> | 2017-02-15 23:33:56 -0600 |
---|---|---|
committer | sstvinc2 <sstvinc2@gmail.com> | 2017-02-15 23:33:56 -0600 |
commit | 233eb048a9bc2c4b84e1ae6a47de6b088779ee4e (patch) | |
tree | 95681c7f50d434f4b8380f17656135324632c6a6 | |
parent | 38483987b2389b92ca06ac1b409f358ecd4fa991 (diff) |
Fixed NYT, plus other parsing fixes and a minor visual tweak
-rw-r--r-- | html_template/unbiased.css | 2 | ||||
-rw-r--r-- | main.py | 7 | ||||
-rw-r--r-- | parser.py | 19 | ||||
-rw-r--r-- | unbiasedFunctions.py | 26 |
4 files changed, 37 insertions, 17 deletions
diff --git a/html_template/unbiased.css b/html_template/unbiased.css
index 86f653a..90c604a 100644
--- a/html_template/unbiased.css
+++ b/html_template/unbiased.css
@@ -69,7 +69,7 @@ a:link, a:visited, a:hover, a:active {
vertical-align:top;
text-align:left;
width:360px;
- height:322px;
+ height:352px;
overflow:hidden;
background:#fff;
margin:10px;
diff --git a/main.py b/main.py
--- a/main.py
+++ b/main.py
@@ -15,6 +15,10 @@ def main():
def run():
sourceList=[]
+
+ nyt=buildNYT()
+ sourceList.append(nyt)
+
blz=buildBlaze()
sourceList.append(blz)
@@ -30,9 +34,6 @@ def run():
wkl=buildWeeklyStandard()
sourceList.append(wkl)
- #nyt=buildNYT()
- #sourceList.append(nyt)
-
fox=buildFoxNews()
sourceList.append(fox)
diff --git a/parser.py b/parser.py
--- a/parser.py
+++ b/parser.py
@@ -225,6 +225,8 @@ def buildBlaze():
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
blz=buildNewsSource2(name, url, h1s, h2s, h3s)
+ blz=removeBadStories(blz, None, ['Tomi Lahren'], None)
+
#The Blaze has dumb, short description fields, so we need to grab
#the first x characters of actual article text instead
blz.h1Arr=blazeFixDesc(blz.h1Arr)
@@ -502,10 +504,17 @@ def buildNYT():
#this will likely need if/else logic
h1=content
- #This is with a large headline over a and b columns
- h1=h1.split('story theme-summary banner', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
+ if 'story theme-summary banner' in h1:
+ #This is with a large headline over a and b columns
+ h1=h1.split('story theme-summary banner', 1)[1]
+ h1=h1.split('<a href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+ else:
+ #otherwise, pull the first story from the A column
+ h1=h1.split('<div class="a-column column">', 1)[1]
+ h1=h1.split('<a href="', 1)[1].split('"', 1)[0]
+ h1s=[h1]
+
#GET SECONDARY HEADLINES
#This comes from the a column or b column, above the break
@@ -557,7 +566,7 @@ def buildNYT():
if (h1 not in x) and (x not in h3s):
h3s.append(x)
- h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
+ h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
nyt=buildNewsSource2(name, url, h1s, h2s, h3s)
return nyt
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index 46723cd..57c8c6a 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -34,23 +34,33 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im
title=title[:-1]
author=''
- authorTags=['article:author', 'dc.creator']
- for tag in authorTags:
- if tag in content:
- author=content.split(tag+'" content=')[1][1:].split('>')[0]
- author=author[:-1]
- break
+ if sourceName!='The Blaze':
+ authorTags=['article:author', 'dc.creator']
+ for tag in authorTags:
+ if tag in content:
+ author=content.split(tag+'" content=')[1][1:].split('>')[0]
+ author=author[:-1]
+ break
+ #handle The Blaze
+ else:
+ if 'class="article-author">' in content:
+ author=content.split('class="article-author">')[1].split('<')[0]
+ elif 'class="article-author" href="' in content:
+ author=content.split('class="article-author" href="')[1]
+ author=author.split('>')[1].split('<')[0].strip()
description=content.split('og:description" content=')[1][1:].split('>')[0]
if description[-1]=='/':
description=description[:-1].strip()
description=description[:-1]
+ #strip out self-references
+ description=description.replace(sourceName, 'our')
a=Article(title, url, img, description, sourceName, author)
return a
except:
- print("Article parsing error in buildArticle() for URL: "+url+" in source"+sourceName)
+ print("Article parsing error in buildArticle() for URL: "+url+" in source "+sourceName+'\n')
return None
@@ -63,7 +73,7 @@ def buildOutput(newsSourceArr):
#set the random order for sources
h1RandomSources=random.sample(range(len(newsSourceArr)), 4)
#For h2s and h3s, select N random sources (can repeat), then
- #a non-repetitive random article from within that source
+ #a non-repetitive random article from within
h2RandomPairs=[]
while len(h2RandomPairs) < 6:
x=random.sample(range(len(newsSourceArr)), 1)[0]
|