From a39dfd89cacbdbc06742cb4c72a981844b2ae371 Mon Sep 17 00:00:00 2001
From: Matt Singleton
Date: Sun, 13 Jan 2019 08:26:24 -0500
Subject: new nbc layout

---
 unbiased/sources/nbc.py | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/unbiased/sources/nbc.py b/unbiased/sources/nbc.py
index 9ce131f..402d2aa 100644
--- a/unbiased/sources/nbc.py
+++ b/unbiased/sources/nbc.py
@@ -1,3 +1,5 @@
+import re
+
 from unbiased.sources.base import NewsSource
 
 class NBC(NewsSource):
@@ -12,21 +14,16 @@ class NBC(NewsSource):
     def _fetch_urls(cls):
         soup = cls._fetch_content(cls.url)
 
-        h1s = soup.find('div', class_='js-top-stories-content')\
-            .find('div', class_='panel_hero')\
-            .a
-        h1s = (h1s['href'],)
-
-        rows = soup.find('div', class_='js-top-stories-content')\
-            .div.find_all('div', class_='row')
-        h2s = []
-        for row in rows:
-            for fragment in row.find_all('div', class_='media-body'):
-                h2s.append(fragment.a['href'])
-        h2s = tuple(h2s)
-
-        links = soup.find('div', class_='js-more-topstories')\
-            .div.find_all('div', class_='story-link')
-        h3s = tuple(x.a['href'] for x in links)
+        articles = soup.find_all('article', class_='teaseCard')
+        article_links = [x.find('a', class_=re.compile('pictureLink__.*')) for x in articles]
+        article_links = [x['href'] for x in article_links if x is not None]
+
+        h1s = tuple(article_links[:3])
+        h2s = tuple(article_links[3:])
+
+        pancake_headlines = soup.find('section', class_='pancake')\
+            .find_all('h3')
+
+        h3s = tuple([x.find('a')['href'] for x in pancake_headlines])
 
         return h1s, h2s, h3s
-- 
cgit v1.2.3