summary refs log tree commit diff
diff options
context:
space:
mode:
author    sstvinc2 <sstvinc2@gmail.com>  2017-02-15 13:19:30 -0600
committer sstvinc2 <sstvinc2@gmail.com>  2017-02-15 13:19:30 -0600
commit 4e8f7609f10abbf6122e59f1456c91498f2a1fbd (patch)
tree   a87375b75e0f3b80ed53eb61cb80a20fdc2ab05d
parent 0fa08599612a92a833c00b1d898cabd429d7bb37 (diff)
Added CBS to new parser
-rw-r--r--  main.py    22
-rw-r--r--  parser.py  88
2 files changed, 82 insertions, 28 deletions
diff --git a/main.py b/main.py
index c70bb5b..09bfddc 100644
--- a/main.py
+++ b/main.py
@@ -21,26 +21,8 @@ def run():
nbc=buildNBC()
sourceList.append(nbc)
- '''
- sourceList.append(NewsSource('NBC News',
- 'http://nbcnews.com',
- [' top-stories-section', 'panel_hero', '<a href="'],
- ['<div class="story-link', '<a href="'],
- [],
- None, None,
- 'ad-content ad-xs mobilebox1', 'taboola-native-top-stories-thumbnail',
- None, None))
- '''
-
- sourceList.append(NewsSource('CBS News',
- 'http://cbsnews.com',
- ['<h1 class="title">', '<a href="'],
- ['<li data-tb-region-item>', '<a href="'],
- [],
- None, None, #'Big News Area Side Assets', '</a>'
- 'Big News Area Side Assets', '</ul></div>',
- None, None))
-
+ cbs=buildCBS()
+ sourceList.append(cbs)
sourceList.append(NewsSource('The Blaze',
diff --git a/parser.py b/parser.py
index a7ca6ac..1a306cf 100644
--- a/parser.py
+++ b/parser.py
@@ -95,7 +95,7 @@ def removeDuplicates(h1s, h2s, h3s):
-def removeBadStories(source, badDescArr, badAuthorArr):
+def removeBadStories(source, badDescArr, badAuthorArr, badImgArr):
if badAuthorArr!=None:
for h1 in source.h1Arr:
@@ -105,18 +105,18 @@ def removeBadStories(source, badDescArr, badAuthorArr):
#if it's in the h1 slot, bump up the first h2 into the h1 slot
source.h1Arr.append(source.h2Arr[0])
source.h2Arr.remove(source.h2Arr[0])
- print('removed '+h1.title+' from '+source.name)
+ print('removed '+h1.title+' from '+source.name+' Reason: bad author')
for h2 in source.h2Arr:
for item in badAuthorArr:
if item in h2.author:
source.h2Arr.remove(h2)
- print('removed '+h2.title+' from '+source.name)
+ print('removed '+h2.title+' from '+source.name+' Reason: bad author')
for h3 in source.h3Arr:
for item in badAuthorArr:
if item in h3.author:
source.h3Arr.remove(h3)
- print('removed '+h3.title+' from '+source.name)
+ print('removed '+h3.title+' from '+source.name+' Reason: bad author')
if badDescArr!=None:
for h1 in source.h1Arr:
@@ -126,22 +126,93 @@ def removeBadStories(source, badDescArr, badAuthorArr):
#if it's in the h1 slot, bump up the first h2 into the h1 slot
source.h1Arr.append(source.h2Arr[0])
source.h2Arr.remove(source.h2Arr[0])
- print('removed '+h1.title+' from '+source.name)
+ print('removed '+h1.title+' from '+source.name+' Reason: bad description')
for h2 in source.h2Arr:
for item in badDescArr:
if item in h2.description:
source.h2Arr.remove(h2)
- print('removed '+h2.title+' from '+source.name)
+ print('removed '+h2.title+' from '+source.name+' Reason: bad description')
for h3 in source.h3Arr:
for item in badDescArr:
if item in h3.description:
source.h3Arr.remove(h3)
- print('removed '+h3.title+' from '+source.name)
+ print('removed '+h3.title+' from '+source.name+' Reason: bad description')
+
+ if badImgArr!=None:
+ for h1 in source.h1Arr:
+ for item in badImgArr:
+ if item in h1.img:
+ source.h1Arr.remove(h1)
+ #if it's in the h1 slot, bump up the first h2 into the h1 slot
+ source.h1Arr.append(source.h2Arr[0])
+ source.h2Arr.remove(source.h2Arr[0])
+ print('removed '+h1.title+' from '+source.name+' Reason: bad image')
+
+ for h2 in source.h2Arr:
+ for item in badImgArr:
+ if item in h2.img:
+ source.h2Arr.remove(h2)
+ print('removed '+h2.title+' from '+source.name+' Reason: bad image')
+
+ for h3 in source.h3Arr:
+ for item in badImgArr:
+ if item in h3.img:
+ source.h3Arr.remove(h3)
+ print('removed '+h3.title+' from '+source.name+' Reason: bad image')
return source
+
+
+def buildCBS():
+ url='http://cbsnews.com'
+ name='CBS News'
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url)
+
+ #get main headline
+ h1=content
+ h1=h1.split('<h1 class="title">', 1)[1]
+ h1=h1.split('<a href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+ h1s=[url+h1]
+
+ #GET SECONDARY HEADLINES
+ h2=content
+ h2s=[]
+ h2=h2.split('Big News Area Side Assets', 1)[1]
+ h2=h2.split('</ul></div>', 1)[0]
+ while '<li data-tb-region-item>' in h2:
+ h2=h2.split('<li data-tb-region-item>', 1)[1]
+ h2=h2.split('<a href="', 1)[1]
+ x=h2.split('"', 1)[0]
+ if h1 not in x:
+ h2s.append(url+x)
+
+ #GET TERTIARY HEADLINES
+ h3=content
+ h3s=[]
+ h3=h3.split('Latest News', 1)[1]
+ #this story section goes on forever; just grab the first 5
+ while len(h3s)<5:
+ h3=h3.split('<li class="item-full-lead"', 1)[1]
+ h3=h3.split('<a href="', 1)[1]
+ x=h3.split('"', 1)[0]
+ if h1 not in x:
+ h3s.append(url+x)
+
+ h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+ cbs=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+ return cbs
+
+
+
+
+
def buildNBC():
url='http://nbcnews.com'
name='NBC News'
@@ -301,7 +372,8 @@ def buildWeeklyStandard():
#REMOVE BAD STORIES
badDescArr=['Matt Labash']
badAuthorArr=['MATT LABASH']
- wkl=removeBadStories(wkl, badDescArr, badAuthorArr)
+ badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png']
+ wkl=removeBadStories(wkl, badDescArr, badAuthorArr, badImgArr)
return wkl