From 4e8f7609f10abbf6122e59f1456c91498f2a1fbd Mon Sep 17 00:00:00 2001 From: sstvinc2 Date: Wed, 15 Feb 2017 13:19:30 -0600 Subject: Added CBS to new parser --- main.py | 22 ++-------------- parser.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 82 insertions(+), 28 deletions(-) diff --git a/main.py b/main.py index c70bb5b..09bfddc 100644 --- a/main.py +++ b/main.py @@ -21,26 +21,8 @@ def run(): nbc=buildNBC() sourceList.append(nbc) - ''' - sourceList.append(NewsSource('NBC News', - 'http://nbcnews.com', - [' top-stories-section', 'panel_hero', '', '' - 'Big News Area Side Assets', '', - None, None)) - + cbs=buildCBS() + sourceList.append(cbs) sourceList.append(NewsSource('The Blaze', diff --git a/parser.py b/parser.py index a7ca6ac..1a306cf 100644 --- a/parser.py +++ b/parser.py @@ -95,7 +95,7 @@ def removeDuplicates(h1s, h2s, h3s): -def removeBadStories(source, badDescArr, badAuthorArr): +def removeBadStories(source, badDescArr, badAuthorArr, badImgArr): if badAuthorArr!=None: for h1 in source.h1Arr: @@ -105,18 +105,18 @@ def removeBadStories(source, badDescArr, badAuthorArr): #if it's in the h1 slot, bump up the first h2 into the h1 slot source.h1Arr.append(source.h2Arr[0]) source.h2Arr.remove(source.h2Arr[0]) - print('removed '+h1.title+' from '+source.name) + print('removed '+h1.title+' from '+source.name+' Reason: bad author') for h2 in source.h2Arr: for item in badAuthorArr: if item in h2.author: source.h2Arr.remove(h2) - print('removed '+h2.title+' from '+source.name) + print('removed '+h2.title+' from '+source.name+' Reason: bad author') for h3 in source.h3Arr: for item in badAuthorArr: if item in h3.author: source.h3Arr.remove(h3) - print('removed '+h3.title+' from '+source.name) + print('removed '+h3.title+' from '+source.name+' Reason: bad author') if badDescArr!=None: for h1 in source.h1Arr: @@ -126,22 +126,93 @@ def removeBadStories(source, badDescArr, badAuthorArr): #if it's in the h1 slot, bump up the first h2 into the h1 slot source.h1Arr.append(source.h2Arr[0]) source.h2Arr.remove(source.h2Arr[0]) - print('removed '+h1.title+' from '+source.name) + print('removed '+h1.title+' from '+source.name+' Reason: bad description') for h2 in source.h2Arr: for item in badDescArr: if item in h2.description: source.h2Arr.remove(h2) - print('removed '+h2.title+' from '+source.name) + print('removed '+h2.title+' from '+source.name+' Reason: bad description') for h3 in source.h3Arr: for item in badDescArr: if item in h3.description: source.h3Arr.remove(h3) - print('removed '+h3.title+' from '+source.name) + print('removed '+h3.title+' from '+source.name+' Reason: bad description') + + if badImgArr!=None: + for h1 in source.h1Arr: + for item in badImgArr: + if item in h1.img: + source.h1Arr.remove(h1) + #if it's in the h1 slot, bump up the first h2 into the h1 slot + source.h1Arr.append(source.h2Arr[0]) + source.h2Arr.remove(source.h2Arr[0]) + print('removed '+h1.title+' from '+source.name+' Reason: bad image') + + for h2 in source.h2Arr: + for item in badImgArr: + if item in h2.img: + source.h2Arr.remove(h2) + print('removed '+h2.title+' from '+source.name+' Reason: bad image') + + for h3 in source.h3Arr: + for item in badImgArr: + if item in h3.img: + source.h3Arr.remove(h3) + print('removed '+h3.title+' from '+source.name+' Reason: bad image') return source + + +def buildCBS(): + url='http://cbsnews.com' + name='CBS News' + + #DOWNLOAD HOMEPAGE CONTENT + content=urlToContent(url) + + #get main headline + h1=content + h1=h1.split('

', 1)[1] + h1=h1.split('', 1)[0] + while '
  • ' in h2: + h2=h2.split('
  • ', 1)[1] + h2=h2.split('