summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r--  unbiased/sources/base.py  6
-rw-r--r--  unbiased/sources/guardian.py  1
-rw-r--r--  unbiased/sources/thehill.py  6
-rw-r--r--  unbiased/sources/washtimes.py  7
4 files changed, 8 insertions, 12 deletions
diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py
index af0a53e..e91e5a8 100644
--- a/unbiased/sources/base.py
+++ b/unbiased/sources/base.py
@@ -68,13 +68,15 @@ class NewsSource(object):
return BeautifulSoup(content, 'lxml')
@classmethod
- def _normalize_url(cls, url, scheme='http'):
+ def _normalize_url(cls, url):
"""
Make sure they have a scheme.
+ Make sure they have a host.
Trim any query string, params, or fragments.
"""
+ cls_url = urllib.parse.urlparse(cls.url)
url = urllib.parse.urlparse(url)
- url = (url.scheme or scheme, url.netloc, url.path, '', '', '')
+ url = (url.scheme or cls_url.scheme, url.netloc or cls_url.netloc, url.path, '', '', '')
return urllib.parse.urlunparse(url)
@classmethod
diff --git a/unbiased/sources/guardian.py b/unbiased/sources/guardian.py
index dff098b..5a1c3dd 100644
--- a/unbiased/sources/guardian.py
+++ b/unbiased/sources/guardian.py
@@ -1,4 +1,3 @@
-import urllib
import html
from unbiased.sources.base import NewsSource
diff --git a/unbiased/sources/thehill.py b/unbiased/sources/thehill.py
index c678261..862204e 100644
--- a/unbiased/sources/thehill.py
+++ b/unbiased/sources/thehill.py
@@ -1,4 +1,3 @@
-import urllib
from unbiased.sources.base import NewsSource
@@ -17,16 +16,15 @@ class TheHill(NewsSource):
h1 = soup.find('h1', class_='top-story-headline')\
.find('a')['href']
- h1 = urllib.parse.urljoin(cls.url, h1)
h1s = (h1,)
h23s = soup.find('div', class_='section-top-content')\
.find_all('div', class_='top-story-item')
h2s = set([x.h4.a['href'] for x in h23s if 'small' not in x['class']])
- h2s = tuple(urllib.parse.urljoin(cls.url, x) for x in h2s)
+ h2s = tuple(h2s)
h3s = set([x.h4.a['href'] for x in h23s if 'small' in x['class']])
- h3s = tuple(urllib.parse.urljoin(cls.url, x) for x in h3s)
+ h3s = tuple(h3s)
return h1s, h2s, h3s
diff --git a/unbiased/sources/washtimes.py b/unbiased/sources/washtimes.py
index e344af6..1be1838 100644
--- a/unbiased/sources/washtimes.py
+++ b/unbiased/sources/washtimes.py
@@ -1,5 +1,3 @@
-import urllib
-
from unbiased.sources.base import NewsSource
class TheWashingtonTimes(NewsSource):
@@ -15,7 +13,6 @@ class TheWashingtonTimes(NewsSource):
h1 = soup.find('article', class_='lead-story')\
.find(class_='article-headline')\
.a['href']
- h1 = urllib.parse.urljoin(cls.url, h1)
h1s = (h1,)
top_articles = soup.find('section', class_='top-news')\
@@ -24,11 +21,11 @@ class TheWashingtonTimes(NewsSource):
for a in top_articles:
if a.attrs.get('class') is None:
h2s.append(a.a['href'])
- h2s = tuple(urllib.parse.urljoin(cls.url, x) for x in h2s)
+ h2s = tuple(h2s)
h3s = soup.find('section', class_='more-from desktop-only')\
.ul.find_all('a')
h3s = [x['href'] for x in h3s]
- h3s = tuple(urllib.parse.urljoin(cls.url, x) for x in h3s)
+ h3s = tuple(h3s)
return h1s, h2s, h3s