diff options
author | sstvinc2 <sstvinc2@gmail.com> | 2017-02-15 12:45:43 -0600 |
---|---|---|
committer | sstvinc2 <sstvinc2@gmail.com> | 2017-02-15 12:45:43 -0600 |
commit | 0fa08599612a92a833c00b1d898cabd429d7bb37 (patch) | |
tree | 4795eacb306b91fcce683c683aab3259f80e3268 | |
parent | 787c1e32630ce72f3d4814615b31005ddb66b650 (diff) |
Added NBC to new parser
-rw-r--r-- | main.py | 9 | ||||
-rw-r--r-- | parser.py | 48 |
2 files changed, 54 insertions, 3 deletions
--- a/main.py
+++ b/main.py
@@ -18,16 +18,19 @@ def run():
     bbc=buildBBC()
     sourceList.append(bbc)
 
-
+    nbc=buildNBC()
+    sourceList.append(nbc)
+
+    '''
     sourceList.append(NewsSource('NBC News',
                                  'http://nbcnews.com',
-                                 ['top-stories-section', 'panel_hero', '<a href="'],
+                                 [' top-stories-section', 'panel_hero', '<a href="'],
                                  ['<div class="story-link', '<a href="'],
                                  [],
                                  None, None,
                                  'ad-content ad-xs mobilebox1', 'taboola-native-top-stories-thumbnail',
                                  None, None))
-
+    '''
 
     sourceList.append(NewsSource('CBS News',
                                  'http://cbsnews.com',
--- a/parser.py
+++ b/parser.py
@@ -142,6 +142,54 @@ def removeBadStories(source, badDescArr, badAuthorArr):
     return source
 
 
+def buildNBC():
+    url='http://nbcnews.com'
+    name='NBC News'
+
+    #DOWNLOAD HOMEPAGE CONTENT
+    content=urlToContent(url)
+
+    #get main headline
+    h1=content
+    h1=h1.split('top-stories-section', 1)[1]
+    h1=h1.split('panel_hero', 1)[1]
+    h1=h1.split('<a href="', 1)[1]
+    h1=h1.split('"', 1)[0]
+    h1s=[url+h1]
+
+    #GET SECONDARY HEADLINES
+    h2=content
+    h2s=[]
+    h2=h2.split('ad-content ad-xs mobilebox1', 1)[1]
+    h2=h2.split('taboola-native-top-stories-thumbnail', 1)[0]
+    while '<div class="story-link' in h2:
+        h2=h2.split('<div class="story-link', 1)[1]
+        h2=h2.split('<a href="', 1)[1]
+        x=h2.split('"', 1)[0]
+        if h1 not in x:
+            h2s.append(url+x)
+
+    #GET TERTIARY HEADLINES
+    h3=content
+    h3s=[]
+    h3=h3.split('js-more-topstories', 1)[1]
+    h3=h3.split('<div class="panel-section', 1)[0]
+    while '<div class="story-link' in h3:
+        h3=h3.split('<div class="story-link', 1)[1]
+        h3=h3.split('<a href="', 1)[1]
+        x=h3.split('"', 1)[0]
+        if h1 not in x:
+            h3s.append(url+x)
+
+
+    h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+    nbc=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+    return nbc
+
+
+
+
 def buildBBC():
     url='http://www.bbc.com/news/world/us_and_canada'
     name='BBC US & Canada'