summary | refs | log | tree | commit | diff
path: root/parser.py
diff options
context:
space:
mode:
author: sstvinc2 <sstvinc2@gmail.com> 2017-02-15 12:45:43 -0600
committer: sstvinc2 <sstvinc2@gmail.com> 2017-02-15 12:45:43 -0600
commit: 0fa08599612a92a833c00b1d898cabd429d7bb37 (patch)
tree: 4795eacb306b91fcce683c683aab3259f80e3268 /parser.py
parent: 787c1e32630ce72f3d4814615b31005ddb66b650 (diff)
Added NBC to new parser
Diffstat (limited to 'parser.py')
-rw-r--r-- parser.py 48
1 files changed, 48 insertions, 0 deletions
diff --git a/parser.py b/parser.py
index 7e348e1..a7ca6ac 100644
--- a/parser.py
+++ b/parser.py
@@ -142,6 +142,54 @@ def removeBadStories(source, badDescArr, badAuthorArr):
return source
+def _extractStoryLinks(section, url, mainPath):
+    """Collect story URLs from one fragment of the homepage HTML.
+
+    For every '<div class="story-link' marker in `section`, the first href
+    that follows it is read and appended as `url + href`. Hrefs that
+    contain `mainPath` are skipped so the main headline is not repeated.
+
+    Returns the list of URLs in page order (possibly empty).
+    """
+    links = []
+    while '<div class="story-link' in section:
+        section = section.split('<div class="story-link', 1)[1]
+        parts = section.split('<a href="', 1)
+        if len(parts) < 2:
+            # A story-link div with no anchor after it: stop here and keep
+            # what was collected instead of failing with IndexError.
+            break
+        section = parts[1]
+        path = section.split('"', 1)[0]
+        if mainPath not in path:
+            links.append(url + path)
+    return links
+
+
+def buildNBC():
+    """Build the NBC News source from the nbcnews.com homepage.
+
+    Downloads the homepage with urlToContent, cuts out the main headline
+    link and the secondary and tertiary story links by splitting on fixed
+    HTML markers, passes the three lists through removeDuplicates and hands
+    them to buildNewsSource2.
+
+    Returns whatever buildNewsSource2 returns for these arguments.
+
+    Raises IndexError if one of the section markers (or the main headline
+    anchor) is not present in the downloaded page.
+    """
+    url = 'http://nbcnews.com'
+    name = 'NBC News'
+
+    #DOWNLOAD HOMEPAGE CONTENT
+    # NOTE(review): assumes urlToContent returns the page as a str - confirm.
+    content = urlToContent(url)
+
+    #GET MAIN HEADLINE
+    h1 = content.split('top-stories-section', 1)[1]
+    h1 = h1.split('panel_hero', 1)[1]
+    h1 = h1.split('<a href="', 1)[1]
+    h1 = h1.split('"', 1)[0]
+    h1s = [url + h1]
+
+    #GET SECONDARY HEADLINES
+    h2 = content.split('ad-content ad-xs mobilebox1', 1)[1]
+    h2 = h2.split('taboola-native-top-stories-thumbnail', 1)[0]
+    h2s = _extractStoryLinks(h2, url, h1)
+
+    #GET TERTIARY HEADLINES
+    h3 = content.split('js-more-topstories', 1)[1]
+    h3 = h3.split('<div class="panel-section', 1)[0]
+    h3s = _extractStoryLinks(h3, url, h1)
+
+    h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+    nbc = buildNewsSource2(name, url, h1s, h2s, h3s)
+
+    return nbc
+
+
+
+
def buildBBC():
url='http://www.bbc.com/news/world/us_and_canada'
name='BBC US & Canada'