From 0f2b265ed3c82e674cc714785a9f308be1db9aa1 Mon Sep 17 00:00:00 2001 From: sstvinc2 Date: Thu, 16 Feb 2017 12:31:25 -0600 Subject: More parsing fixes, more bad article flagging --- parser.py | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/parser.py b/parser.py index 31c09da..d12b1c2 100644 --- a/parser.py +++ b/parser.py @@ -33,11 +33,15 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs): h2Arr=[] for x in h2URLs: - h2Arr.append(buildArticle(x, name)) + a=buildArticle(x, name) + if a!=None: + h2Arr.append(a) h3Arr=[] for x in h3URLs: - h3Arr.append(buildArticle(x, name)) + a=buildArticle(x, name) + if a!=None: + h3Arr.append(a) #BUILD THE NEWS SOURCE newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr) @@ -107,6 +111,7 @@ def removeBadStories(source, badDescArr, badAuthorArr, badImgArr): source.h1Arr.append(source.h2Arr[0]) source.h2Arr.remove(source.h2Arr[0]) print('removed '+h1.title+' from '+source.name+' Reason: bad author') + for h2 in source.h2Arr: for item in badAuthorArr: if item in h2.author: @@ -223,9 +228,10 @@ def buildBlaze(): h3s.append(url+x) h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - blz=buildNewsSource2(name, url, h1s, h2s, h3s) - blz=removeBadStories(blz, None, ['Tomi Lahren', 'Dana Loesch'], None) + + blz=buildNewsSource2(name, url, h1s, h2s, h3s) + blz=removeBadStories(blz, None, ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka'], None) #The Blaze has dumb, short description fields, so we need to grab #the first x characters of actual article text instead @@ -246,10 +252,20 @@ def buildCBS(): #get main headline h1=content - h1=h1.split('

', 1)[1] - h1=h1.split('' in content: + h1=h1.split('

', 1)[1] + h1=h1.split('', 1)[0] + h1=h1.split('
  • ', 1)[1] + h1=h1.split('