From 0854c3c73d38e75f8e30363f9a05b87a12c5290d Mon Sep 17 00:00:00 2001
From: Matt Singleton <matt@xcolour.net>
Date: Tue, 12 Sep 2017 23:03:26 -0400
Subject: fill in missing url scheme and host from cls.url in base parser

---
 unbiased/sources/base.py      | 6 ++++--
 unbiased/sources/guardian.py  | 1 -
 unbiased/sources/thehill.py   | 6 ++----
 unbiased/sources/washtimes.py | 7 ++-----
 4 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py
index af0a53e..e91e5a8 100644
--- a/unbiased/sources/base.py
+++ b/unbiased/sources/base.py
@@ -68,13 +68,15 @@ class NewsSource(object):
         return BeautifulSoup(content, 'lxml')
 
     @classmethod
-    def _normalize_url(cls, url, scheme='http'):
+    def _normalize_url(cls, url):
         """
         Make sure they have a scheme.
+        Make sure they have a host (falling back to the host of cls.url).
         Trim any query string, params, or fragments.
         """
+        cls_url = urllib.parse.urlparse(cls.url)
         url = urllib.parse.urlparse(url)
-        url = (url.scheme or scheme, url.netloc, url.path, '', '', '')
+        url = (url.scheme or cls_url.scheme, url.netloc or cls_url.netloc, url.path, '', '', '')
         return urllib.parse.urlunparse(url)
 
     @classmethod
diff --git a/unbiased/sources/guardian.py b/unbiased/sources/guardian.py
index dff098b..5a1c3dd 100644
--- a/unbiased/sources/guardian.py
+++ b/unbiased/sources/guardian.py
@@ -1,4 +1,3 @@
-import urllib
 import html
 
 from unbiased.sources.base import NewsSource
diff --git a/unbiased/sources/thehill.py b/unbiased/sources/thehill.py
index c678261..862204e 100644
--- a/unbiased/sources/thehill.py
+++ b/unbiased/sources/thehill.py
@@ -1,4 +1,3 @@
-import urllib
 
 from unbiased.sources.base import NewsSource
 
@@ -17,16 +16,15 @@ class TheHill(NewsSource):
 
         h1 = soup.find('h1', class_='top-story-headline')\
             .find('a')['href']
-        h1 = urllib.parse.urljoin(cls.url, h1)
         h1s = (h1,)
 
         h23s = soup.find('div', class_='section-top-content')\
                   .find_all('div', class_='top-story-item')
         h2s = set([x.h4.a['href'] for x in h23s if 'small' not in x['class']])
-        h2s = tuple(urllib.parse.urljoin(cls.url, x) for x in h2s)
+        h2s = tuple(h2s)
 
         h3s = set([x.h4.a['href'] for x in h23s if 'small' in x['class']])
-        h3s = tuple(urllib.parse.urljoin(cls.url, x) for x in h3s)
+        h3s = tuple(h3s)
 
         return h1s, h2s, h3s
 
diff --git a/unbiased/sources/washtimes.py b/unbiased/sources/washtimes.py
index e344af6..1be1838 100644
--- a/unbiased/sources/washtimes.py
+++ b/unbiased/sources/washtimes.py
@@ -1,5 +1,3 @@
-import urllib
-
 from unbiased.sources.base import NewsSource
 
 class TheWashingtonTimes(NewsSource):
@@ -15,7 +13,6 @@ class TheWashingtonTimes(NewsSource):
         h1 = soup.find('article', class_='lead-story')\
                 .find(class_='article-headline')\
                 .a['href']
-        h1 = urllib.parse.urljoin(cls.url, h1)
         h1s = (h1,)
 
         top_articles = soup.find('section', class_='top-news')\
@@ -24,11 +21,11 @@ class TheWashingtonTimes(NewsSource):
         for a in top_articles:
             if a.attrs.get('class') is None:
                 h2s.append(a.a['href'])
-        h2s = tuple(urllib.parse.urljoin(cls.url, x) for x in h2s)
+        h2s = tuple(h2s)
 
         h3s = soup.find('section', class_='more-from desktop-only')\
                 .ul.find_all('a')
         h3s = [x['href'] for x in h3s]
-        h3s = tuple(urllib.parse.urljoin(cls.url, x) for x in h3s)
+        h3s = tuple(h3s)
 
         return h1s, h2s, h3s
-- 
cgit v1.2.3