diff options
author | Matt Singleton <matt@xcolour.net> | 2017-09-11 20:09:41 -0400 |
---|---|---|
committer | Matt Singleton <matt@xcolour.net> | 2017-09-11 20:09:41 -0400 |
commit | 985ce4c540cd437b6e6475fb0e969f2aea0bd901 (patch) | |
tree | 782d5d43cda95fa6cf7d13fa87f5afd8b250d093 | |
parent | d0c7c0541013cc9472b38ccfd614a314e9a86d70 (diff) |
normalize urls using urllib
-rw-r--r-- | unbiased/sources/base.py | 18 |
1 file changed, 8 insertions, 10 deletions
diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py index 9dc14fd..51c4cff 100644 --- a/unbiased/sources/base.py +++ b/unbiased/sources/base.py @@ -51,9 +51,9 @@ class NewsSource(object): @classmethod def build(cls): h1s, h2s, h3s = cls._fetch_urls() - h1s = tuple(cls._fix_url(x) for x in h1s) - h2s = tuple(cls._fix_url(x) for x in h2s) - h3s = tuple(cls._fix_url(x) for x in h3s) + h1s = tuple(cls._normalize_url(x) for x in h1s) + h2s = tuple(cls._normalize_url(x) for x in h2s) + h3s = tuple(cls._normalize_url(x) for x in h3s) h1s, h2s, h3s = cls._remove_duplicates(h1s, h2s, h3s) h1s, h2s, h3s = cls._fetch_articles(h1s, h2s, h3s) h1s, h2s, h3s = cls._remove_all_bad_stories(h1s, h2s, h3s) @@ -69,16 +69,14 @@ class NewsSource(object): return BeautifulSoup(content, 'lxml') @classmethod - def _fix_url(cls, url, scheme='http'): + def _normalize_url(cls, url, scheme='http'): """ Make sure they have a scheme. - Trim any query parameters. + Trim any query string, params, or fragments. """ - # TODO: proper URL parsing - if url.startswith('//'): - url = '{}:{}'.format(scheme, x) - url = url.split('?')[0] - return url + url = urllib.parse.urlparse(url) + url = (url.scheme or scheme, url.netloc, url.path, '', '', '') + return urllib.parse.urlunparse(url) @classmethod def _remove_duplicates(cls, h1s, h2s, h3s): |