diff options
author | Matt Singleton <msingleton@aclu.org> | 2017-09-27 21:17:17 -0400 |
---|---|---|
committer | Matt Singleton <msingleton@aclu.org> | 2017-09-27 21:17:17 -0400 |
commit | e674ae4ca972e2f902dcc96d65fd4e792668b8a2 (patch) | |
tree | d51a0fbad35f622d95af73078c8fb44e5f82d7aa | |
parent | 0854c3c73d38e75f8e30363f9a05b87a12c5290d (diff) |
let _normalize_url() optionally preserve some query args
-rw-r--r-- | unbiased/sources/base.py | 17 |
1 files changed, 15 insertions, 2 deletions
diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py index e91e5a8..14d867e 100644 --- a/unbiased/sources/base.py +++ b/unbiased/sources/base.py @@ -68,7 +68,7 @@ class NewsSource(object): return BeautifulSoup(content, 'lxml') @classmethod - def _normalize_url(cls, url): + def _normalize_url(cls, url, keep_query_vars=None): """ Make sure they have a scheme. Make sure they have a host. @@ -76,7 +76,20 @@ class NewsSource(object): """ cls_url = urllib.parse.urlparse(cls.url) url = urllib.parse.urlparse(url) - url = (url.scheme or cls_url.scheme, url.netloc or cls_url.netloc, url.path, '', '', '') + if keep_query_vars is None: + query = '' + else: + query_vars = {} + qs = urllib.parse.parse_qs(url.query) + for v in keep_query_vars: + if v in qs: + query_vars[v] = qs[v] + query_pairs = [] + for k, i in query_vars.items(): + for v in i: + query_pairs.append('{}={}'.format(k, v)) + query = '?'.join(query_pairs) + url = (url.scheme or cls_url.scheme, url.netloc or cls_url.netloc, url.path, '', query, '') return urllib.parse.urlunparse(url) @classmethod |