summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rwxr-xr-x unbiased/html_template/unbiased.css 4
-rwxr-xr-x unbiased/main.py 12
-rw-r--r-- unbiased/unbiasedFunctions.py 32
3 files changed, 36 insertions, 12 deletions
diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css
index 244f100..6817cc9 100755
--- a/unbiased/html_template/unbiased.css
+++ b/unbiased/html_template/unbiased.css
@@ -84,7 +84,7 @@ a:hover{
vertical-align:top;
text-align:left;
width:360px;
- height:auto;
+ height:350px;
overflow:hidden;
background:#fff;
margin:10px;
@@ -217,4 +217,4 @@ a:hover{
clear:both;
padding-top:4em;
font-size:.8em;
-}
\ No newline at end of file
+}
diff --git a/unbiased/main.py b/unbiased/main.py
index ea5412d..87b1e8c 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -48,10 +48,12 @@ def run(webroot, scratch):
### E.g. Guardian calls buildGuardian(), etc.
sourceFnArr=['Guardian', 'TheHill', 'NPR', 'BBC', 'NBC', 'CBS',
'FoxNews', 'WashTimes', 'CSM', 'ABC'] #'Blaze'
-
+
for source in sourceFnArr:
+ logger.info('Crawling {}'.format(source))
tries=0
while tries<3:
+ time.sleep(tries)
try:
fn='build'+source
possibles = globals().copy()
@@ -61,10 +63,12 @@ def run(webroot, scratch):
sourceList.append(src)
break
except Exception as ex:
- logger.error('Build error. Looping again. source={} ex={}'.format(source, ex))
tries+=1
- time.sleep(tries)
-
+ if tries == 3:
+ logger.error('Build failed. source={} ex={}'.format(source, ex))
+ else:
+ logger.debug('Build failed, retrying. source={} ex={}'.format(source, ex))
+
#scrape all urls and build data structure
newsSourceArr=buildNewsSourceArr(sourceList, scratch)
diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py
index 775346f..fdf9d8f 100644
--- a/unbiased/unbiasedFunctions.py
+++ b/unbiased/unbiasedFunctions.py
@@ -1,3 +1,4 @@
+import html
import logging
import os
import pkgutil
@@ -32,7 +33,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
f.close()
try:
- if sourceName=='The Guardian':
+ if sourceName=='The Guardian US':
#The Guardian puts an identifying banner on their og:images
#grab the main image from the page instead
@@ -48,14 +49,15 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
elif '<img class="immersive-main-media__media"' in content:
img=content.split('<img class="immersive-main-media__media"', 1)[1]
img=img.split('src="', 1)[1].split('"')[0]
-
+ img = html.unescape(img)
+
else:
if 'og:image' in content:
img=content.split('og:image" content=')[1][1:].split('>')[0]
elif sourceName=='ABC News':
img='https://c1.staticflickr.com/7/6042/6276688407_12900948a2_b.jpgX'
if img[-1]=='/':
- #because the quote separator could be ' or ",
+ #because the quote separator could be ' or ",
#trim to just before it then lop it off
img=img[:-1].strip()
img=img[:-1]
@@ -282,16 +284,34 @@ def buildNewsSourceArr(sourceList, scratchDir):
#since everything should have been modified in place
return sourceList
-def pullImage(url, index, webroot, width=350, height=200):
+def pullImage(url, index, webroot, target_width=350, target_height=200):
extension = url.split('.')[-1].split('?')[0]
img_name = 'img{}.{}'.format(index, extension)
out_file = os.path.join(webroot, img_name)
try:
subprocess.check_call(['wget', '-q', '-O', out_file, '--no-check-certificate', url])
- except Exception:
+ except Exception as ex:
+ logger.error('Failed to pull image: url={} ex={}'.format(url, ex))
return ''
img = Image.open(out_file)
- img.resize((width, height))
+ # crop to aspect ratio
+ target_ar = target_width / target_height
+ left, top, right, bottom = img.getbbox()
+ height = bottom - top
+ width = right - left
+ ar = width / height
+ if target_ar > ar:
+ new_height = (target_height / target_width) * width
+ bbox = (left, top + ((height - new_height) / 2), right, bottom - ((height - new_height) / 2))
+ img = img.crop(bbox)
+ elif target_ar < ar:
+ new_width = (target_width / target_height) * height
+ bbox = (left + ((width - new_width) / 2), top, right - ((width - new_width) / 2), bottom)
+ img = img.crop(bbox)
+ # resize if larger
+ if target_width * 2 < width or target_height * 2 < height:
+ img = img.resize((target_width*2, target_height*2), Image.LANCZOS)
+ # TODO: create retina images
jpg_name = 'img{}.jpg'.format(index)
img.save(os.path.join(webroot, jpg_name), 'JPEG')
return jpg_name