summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: sstvinc2 <sstvinc2@gmail.com>, 2017-02-15 10:42:53 -0600
committer: sstvinc2 <sstvinc2@gmail.com>, 2017-02-15 10:42:53 -0600
commit: 787c1e32630ce72f3d4814615b31005ddb66b650 (patch)
tree: 00ff0a23218f2d2b7a4c1225b71c1b714036a4fb
parent: 064e200fa57dfeda3776b7a708cc0af8280f20fb (diff)
Added BBC to new parser
-rw-r--r-- main.py 12
-rw-r--r-- parser.py 59
-rw-r--r-- unbiasedFunctions.py 2
3 files changed, 59 insertions, 14 deletions
diff --git a/main.py b/main.py
index 948bf36..7fbcc23 100644
--- a/main.py
+++ b/main.py
@@ -15,16 +15,8 @@ def main():
def run():
sourceList=[]
-
- sourceList.append(NewsSource('BBC US',
- 'http://www.bbc.com/news/world/us_and_canada',
- ['buzzard-item', '<a href="'],
- ['top_stories#', '<a href="'],
- [],
- None, None,
- '<div class="pigeon">','<div id=',
- None, None,
- 'http://www.bbc.com'))
+ bbc=buildBBC()
+ sourceList.append(bbc)
sourceList.append(NewsSource('NBC News',
diff --git a/parser.py b/parser.py
index 0bd5b0f..7e348e1 100644
--- a/parser.py
+++ b/parser.py
@@ -118,7 +118,6 @@ def removeBadStories(source, badDescArr, badAuthorArr):
source.h3Arr.remove(h3)
print('removed '+h3.title+' from '+source.name)
- '''
if badDescArr!=None:
for h1 in source.h1Arr:
for item in badDescArr:
@@ -139,10 +138,64 @@ def removeBadStories(source, badDescArr, badAuthorArr):
if item in h3.description:
source.h3Arr.remove(h3)
print('removed '+h3.title+' from '+source.name)
- '''
return source
-
+
+
+def buildBBC():
+ url='http://www.bbc.com/news/world/us_and_canada'
+ name='BBC US & Canada'
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url)
+
+ #get main headline
+ h1=content
+ h1=h1.split('buzzard-item', 1)[1]
+ h1=h1.split('<a href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+ h1s=['http://www.bbc.com'+h1]
+
+ #GET SECONDARY HEADLINES
+ h2=content
+ h2s=[]
+ h2=h2.split('<div class="pigeon">', 1)[1]
+ h2=h2.split('<div id=', 1)[0]
+ while 'top_stories#' in h2:
+ h2=h2.split('top_stories#', 1)[1]
+ h2=h2.split('<a href="', 1)[1]
+ x=h2.split('"', 1)[0]
+ if h1 not in x:
+ h2s.append('http://www.bbc.com'+x)
+
+ #GET TERTIARY HEADLINES
+ h3=content
+ h3s=[]
+ h3=h3.split('<div class="macaw">', 1)[1]
+ h3=h3.split('Watch/Listen', 1)[0]
+ while '<div class="macaw-item' in h3:
+ h3=h3.split('<div class="macaw-item', 1)[1]
+ h3=h3.split('<a href="', 1)[1]
+ x=h3.split('"', 1)[0]
+ if h1 not in x:
+ h3s.append('http://www.bbc.com'+x)
+
+ h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+ bbc=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+ #REMOVE ' - BBC News' from headlines
+ for i in range(len(bbc.h1Arr)):
+ if ' - BBC News' in bbc.h1Arr[i].title:
+ bbc.h1Arr[i].title=bbc.h1Arr[i].title.split(' - BBC News', 1)[0]
+ for i in range(len(bbc.h2Arr)):
+ if ' - BBC News' in bbc.h2Arr[i].title:
+ bbc.h2Arr[i].title=bbc.h2Arr[i].title.split(' - BBC News', 1)[0]
+ for i in range(len(bbc.h3Arr)):
+ if ' - BBC News' in bbc.h3Arr[i].title:
+ bbc.h3Arr[i].title=bbc.h3Arr[i].title.split(' - BBC News', 1)[0]
+
+ return bbc
+
def buildWeeklyStandard():
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index 5f46ed2..733d6ba 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -7,7 +7,7 @@ import time
#take in a url and delimiters, return twitter card
def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
- ''' PRINT DEBUGGING
+ '''#PRINT DEBUGGING
print(sourceName)
print(url)
print()