summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Matt Singleton <matt@xcolour.net> 2017-09-11 20:09:41 -0400
committer Matt Singleton <matt@xcolour.net> 2017-09-11 20:09:41 -0400
commit 985ce4c540cd437b6e6475fb0e969f2aea0bd901 (patch)
tree 782d5d43cda95fa6cf7d13fa87f5afd8b250d093
parent d0c7c0541013cc9472b38ccfd614a314e9a86d70 (diff)
normalize urls using urllib
-rw-r--r-- unbiased/sources/base.py 18
1 files changed, 8 insertions, 10 deletions
diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py
index 9dc14fd..51c4cff 100644
--- a/unbiased/sources/base.py
+++ b/unbiased/sources/base.py
@@ -51,9 +51,9 @@ class NewsSource(object):
@classmethod
def build(cls):
h1s, h2s, h3s = cls._fetch_urls()
- h1s = tuple(cls._fix_url(x) for x in h1s)
- h2s = tuple(cls._fix_url(x) for x in h2s)
- h3s = tuple(cls._fix_url(x) for x in h3s)
+ h1s = tuple(cls._normalize_url(x) for x in h1s)
+ h2s = tuple(cls._normalize_url(x) for x in h2s)
+ h3s = tuple(cls._normalize_url(x) for x in h3s)
h1s, h2s, h3s = cls._remove_duplicates(h1s, h2s, h3s)
h1s, h2s, h3s = cls._fetch_articles(h1s, h2s, h3s)
h1s, h2s, h3s = cls._remove_all_bad_stories(h1s, h2s, h3s)
@@ -69,16 +69,14 @@ class NewsSource(object):
return BeautifulSoup(content, 'lxml')
@classmethod
- def _fix_url(cls, url, scheme='http'):
+ def _normalize_url(cls, url, scheme='http'):
"""
Make sure they have a scheme.
- Trim any query parameters.
+ Trim any query string, params, or fragments.
"""
- # TODO: proper URL parsing
- if url.startswith('//'):
- url = '{}:{}'.format(scheme, x)
- url = url.split('?')[0]
- return url
+ url = urllib.parse.urlparse(url)
+ url = (url.scheme or scheme, url.netloc, url.path, '', '', '')
+ return urllib.parse.urlunparse(url)
@classmethod
def _remove_duplicates(cls, h1s, h2s, h3s):