summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Matt Singleton <msingleton@aclu.org> 2017-09-27 21:17:17 -0400
committer: Matt Singleton <msingleton@aclu.org> 2017-09-27 21:17:17 -0400
commit: e674ae4ca972e2f902dcc96d65fd4e792668b8a2 (patch)
tree: d51a0fbad35f622d95af73078c8fb44e5f82d7aa
parent: 0854c3c73d38e75f8e30363f9a05b87a12c5290d (diff)
let _normalize_url() optionally preserve some query args
-rw-r--r-- unbiased/sources/base.py 17
1 files changed, 15 insertions, 2 deletions
diff --git a/unbiased/sources/base.py b/unbiased/sources/base.py
index e91e5a8..14d867e 100644
--- a/unbiased/sources/base.py
+++ b/unbiased/sources/base.py
@@ -68,7 +68,7 @@ class NewsSource(object):
return BeautifulSoup(content, 'lxml')
@classmethod
- def _normalize_url(cls, url):
+ def _normalize_url(cls, url, keep_query_vars=None):
"""
Make sure they have a scheme.
Make sure they have a host.
@@ -76,7 +76,20 @@ class NewsSource(object):
"""
cls_url = urllib.parse.urlparse(cls.url)
url = urllib.parse.urlparse(url)
- url = (url.scheme or cls_url.scheme, url.netloc or cls_url.netloc, url.path, '', '', '')
+ if keep_query_vars is None:
+ query = ''
+ else:
+ query_vars = {}
+ qs = urllib.parse.parse_qs(url.query)
+ for v in keep_query_vars:
+ if v in qs:
+ query_vars[v] = qs[v]
+ query_pairs = []
+ for k, i in query_vars.items():
+ for v in i:
+ query_pairs.append('{}={}'.format(k, v))
+ query = '?'.join(query_pairs)
+ url = (url.scheme or cls_url.scheme, url.netloc or cls_url.netloc, url.path, '', query, '')
return urllib.parse.urlunparse(url)
@classmethod