diff options
-rw-r--r-- | unbiased/sources/base.py | 6 | ||||
-rw-r--r-- | unbiased/sources/guardian.py | 1 | ||||
-rw-r--r-- | unbiased/sources/thehill.py | 6 | ||||
-rw-r--r-- | unbiased/sources/washtimes.py | 7 |
4 files changed, 8 insertions, 12 deletions
diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py index af0a53e..e91e5a8 100644 --- a/unbiased/sources/base.py +++ b/unbiased/sources/base.py @@ -68,13 +68,15 @@ class NewsSource(object): return BeautifulSoup(content, 'lxml') @classmethod - def _normalize_url(cls, url, scheme='http'): + def _normalize_url(cls, url): """ Make sure they have a scheme. + Make sure they have a host. Trim any query string, params, or fragments. """ + cls_url = urllib.parse.urlparse(cls.url) url = urllib.parse.urlparse(url) - url = (url.scheme or scheme, url.netloc, url.path, '', '', '') + url = (url.scheme or cls_url.scheme, url.netloc or cls_url.netloc, url.path, '', '', '') return urllib.parse.urlunparse(url) @classmethod diff --git a/unbiased/sources/guardian.py b/unbiased/sources/guardian.py index dff098b..5a1c3dd 100644 --- a/unbiased/sources/guardian.py +++ b/unbiased/sources/guardian.py @@ -1,4 +1,3 @@ -import urllib import html from unbiased.sources.base import NewsSource diff --git a/unbiased/sources/thehill.py b/unbiased/sources/thehill.py index c678261..862204e 100644 --- a/unbiased/sources/thehill.py +++ b/unbiased/sources/thehill.py @@ -1,4 +1,3 @@ -import urllib from unbiased.sources.base import NewsSource @@ -17,16 +16,15 @@ class TheHill(NewsSource): h1 = soup.find('h1', class_='top-story-headline')\ .find('a')['href'] - h1 = urllib.parse.urljoin(cls.url, h1) h1s = (h1,) h23s = soup.find('div', class_='section-top-content')\ .find_all('div', class_='top-story-item') h2s = set([x.h4.a['href'] for x in h23s if 'small' not in x['class']]) - h2s = tuple(urllib.parse.urljoin(cls.url, x) for x in h2s) + h2s = tuple(h2s) h3s = set([x.h4.a['href'] for x in h23s if 'small' in x['class']]) - h3s = tuple(urllib.parse.urljoin(cls.url, x) for x in h3s) + h3s = tuple(h3s) return h1s, h2s, h3s diff --git a/unbiased/sources/washtimes.py b/unbiased/sources/washtimes.py index e344af6..1be1838 100644 --- a/unbiased/sources/washtimes.py +++ b/unbiased/sources/washtimes.py @@ -1,5 +1,3 @@ -import urllib - from unbiased.sources.base import NewsSource class TheWashingtonTimes(NewsSource): @@ -15,7 +13,6 @@ class TheWashingtonTimes(NewsSource): h1 = soup.find('article', class_='lead-story')\ .find(class_='article-headline')\ .a['href'] - h1 = urllib.parse.urljoin(cls.url, h1) h1s = (h1,) top_articles = soup.find('section', class_='top-news')\ @@ -24,11 +21,11 @@ class TheWashingtonTimes(NewsSource): for a in top_articles: if a.attrs.get('class') is None: h2s.append(a.a['href']) - h2s = tuple(urllib.parse.urljoin(cls.url, x) for x in h2s) + h2s = tuple(h2s) h3s = soup.find('section', class_='more-from desktop-only')\ .ul.find_all('a') h3s = [x['href'] for x in h3s] - h3s = tuple(urllib.parse.urljoin(cls.url, x) for x in h3s) + h3s = tuple(h3s) return h1s, h2s, h3s |