From 0854c3c73d38e75f8e30363f9a05b87a12c5290d Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Tue, 12 Sep 2017 23:03:26 -0400 Subject: update url with host in base parser --- unbiased/sources/base.py | 6 ++++-- unbiased/sources/guardian.py | 1 - unbiased/sources/thehill.py | 6 ++---- unbiased/sources/washtimes.py | 7 ++----- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py index af0a53e..e91e5a8 100644 --- a/unbiased/sources/base.py +++ b/unbiased/sources/base.py @@ -68,13 +68,15 @@ class NewsSource(object): return BeautifulSoup(content, 'lxml') @classmethod - def _normalize_url(cls, url, scheme='http'): + def _normalize_url(cls, url): """ Make sure they have a scheme. + Make sure they have a host. Trim any query string, params, or fragments. """ + cls_url = urllib.parse.urlparse(cls.url) url = urllib.parse.urlparse(url) - url = (url.scheme or scheme, url.netloc, url.path, '', '', '') + url = (url.scheme or cls_url.scheme, url.netloc or cls_url.netloc, url.path, '', '', '') return urllib.parse.urlunparse(url) @classmethod diff --git a/unbiased/sources/guardian.py b/unbiased/sources/guardian.py index dff098b..5a1c3dd 100644 --- a/unbiased/sources/guardian.py +++ b/unbiased/sources/guardian.py @@ -1,4 +1,3 @@ -import urllib import html from unbiased.sources.base import NewsSource diff --git a/unbiased/sources/thehill.py b/unbiased/sources/thehill.py index c678261..862204e 100644 --- a/unbiased/sources/thehill.py +++ b/unbiased/sources/thehill.py @@ -1,4 +1,3 @@ -import urllib from unbiased.sources.base import NewsSource @@ -17,16 +16,15 @@ class TheHill(NewsSource): h1 = soup.find('h1', class_='top-story-headline')\ .find('a')['href'] - h1 = urllib.parse.urljoin(cls.url, h1) h1s = (h1,) h23s = soup.find('div', class_='section-top-content')\ .find_all('div', class_='top-story-item') h2s = set([x.h4.a['href'] for x in h23s if 'small' not in x['class']]) - h2s = tuple(urllib.parse.urljoin(cls.url, x) for x in h2s) + h2s = tuple(h2s) h3s = set([x.h4.a['href'] for x in h23s if 'small' in x['class']]) - h3s = tuple(urllib.parse.urljoin(cls.url, x) for x in h3s) + h3s = tuple(h3s) return h1s, h2s, h3s diff --git a/unbiased/sources/washtimes.py b/unbiased/sources/washtimes.py index e344af6..1be1838 100644 --- a/unbiased/sources/washtimes.py +++ b/unbiased/sources/washtimes.py @@ -1,5 +1,3 @@ -import urllib - from unbiased.sources.base import NewsSource class TheWashingtonTimes(NewsSource): @@ -15,7 +13,6 @@ class TheWashingtonTimes(NewsSource): h1 = soup.find('article', class_='lead-story')\ .find(class_='article-headline')\ .a['href'] - h1 = urllib.parse.urljoin(cls.url, h1) h1s = (h1,) top_articles = soup.find('section', class_='top-news')\ @@ -24,11 +21,11 @@ class TheWashingtonTimes(NewsSource): for a in top_articles: if a.attrs.get('class') is None: h2s.append(a.a['href']) - h2s = tuple(urllib.parse.urljoin(cls.url, x) for x in h2s) + h2s = tuple(h2s) h3s = soup.find('section', class_='more-from desktop-only')\ .ul.find_all('a') h3s = [x['href'] for x in h3s] - h3s = tuple(urllib.parse.urljoin(cls.url, x) for x in h3s) + h3s = tuple(h3s) return h1s, h2s, h3s -- cgit v1.2.3