Added NBC to new parser

author: sstvinc2 <sstvinc2@gmail.com> 2017-02-15 12:45:43 -0600
committer: sstvinc2 <sstvinc2@gmail.com> 2017-02-15 12:45:43 -0600
commit: 0fa08599612a92a833c00b1d898cabd429d7bb37 (patch)
tree: 4795eacb306b91fcce683c683aab3259f80e3268
parent: 787c1e32630ce72f3d4814615b31005ddb66b650 (diff)
2 files changed, 54 insertions, 3 deletions
diff --git a/main.py b/main.py
index 7fbcc23..c70bb5b 100644
--- a/main.py
+++ b/main.py
@@ -18,16 +18,19 @@ def run():
     bbc=buildBBC()
     sourceList.append(bbc)
 
-    
+    nbc=buildNBC()
+    sourceList.append(nbc)
+
+    '''
     sourceList.append(NewsSource('NBC News',
                                  'http://nbcnews.com',
-                                 ['top-stories-section', 'panel_hero', '<a href="'],
+                                 [' top-stories-section', 'panel_hero', '<a href="'],
                                  ['<div class="story-link', '<a href="'],
                                  [],
                                  None, None,
                                  'ad-content ad-xs mobilebox1', 'taboola-native-top-stories-thumbnail',
                                  None, None))
-
+    '''
 
     sourceList.append(NewsSource('CBS News',
                                  'http://cbsnews.com',
diff --git a/parser.py b/parser.py
index 7e348e1..a7ca6ac 100644
--- a/parser.py
+++ b/parser.py
@@ -142,6 +142,54 @@ def removeBadStories(source, badDescArr, badAuthorArr):
     return source
 
 
+def buildNBC():    # Build the 'NBC News' source from link hrefs scraped off the nbcnews.com homepage.
+    url='http://nbcnews.com'
+    name='NBC News'
+    # NOTE: each split(...)[1] below raises IndexError when its marker is absent from the page.
+    # Download the homepage content.
+    content=urlToContent(url)
+
+    # Main headline: the first href after the 'top-stories-section' and 'panel_hero' markers.
+    h1=content
+    h1=h1.split('top-stories-section', 1)[1]
+    h1=h1.split('panel_hero', 1)[1]
+    h1=h1.split('<a href="', 1)[1]
+    h1=h1.split('"', 1)[0]    # keep only the href value, up to its closing quote
+    h1s=[url+h1]    # assumes the href is site-relative -- TODO confirm
+
+    # Secondary headlines: story links between the two markers split on below.
+    h2=content
+    h2s=[]
+    h2=h2.split('ad-content ad-xs mobilebox1', 1)[1]
+    h2=h2.split('taboola-native-top-stories-thumbnail', 1)[0]
+    while '<div class="story-link' in h2:    # each pass consumes text up to the next story link
+        h2=h2.split('<div class="story-link', 1)[1]
+        h2=h2.split('<a href="', 1)[1]
+        x=h2.split('"', 1)[0]
+        if h1 not in x:    # skip links whose href contains the main headline's href
+            h2s.append(url+x)
+
+    # Tertiary headlines: same extraction, between 'js-more-topstories' and the next panel-section div.
+    h3=content
+    h3s=[]
+    h3=h3.split('js-more-topstories', 1)[1]
+    h3=h3.split('<div class="panel-section', 1)[0]
+    while '<div class="story-link' in h3:
+        h3=h3.split('<div class="story-link', 1)[1]
+        h3=h3.split('<a href="', 1)[1]
+        x=h3.split('"', 1)[0]
+        if h1 not in x:
+            h3s.append(url+x)
+
+    # Presumably drops links repeated across the three lists -- verify against removeDuplicates.
+    h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+    nbc=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+    return nbc
+
+
+
+
 def buildBBC():    
     url='http://www.bbc.com/news/world/us_and_canada'
     name='BBC US & Canada'
author	sstvinc2 <sstvinc2@gmail.com>	2017-02-15 12:45:43 -0600
committer	sstvinc2 <sstvinc2@gmail.com>	2017-02-15 12:45:43 -0600
commit	0fa08599612a92a833c00b1d898cabd429d7bb37 (patch)
tree	4795eacb306b91fcce683c683aab3259f80e3268
parent	787c1e32630ce72f3d4814615b31005ddb66b650 (diff)