From 7a8efb94dc2463a6d30afc77f10df78ebfa4c353 Mon Sep 17 00:00:00 2001
From: Matt Singleton
Date: Wed, 19 Apr 2017 16:39:03 -0400
Subject: replace wget with requests library

---
 setup.py                            |  1 +
 unbiased/html_template/unbiased.css |  8 +--
 unbiased/main.py                    |  2 +-
 unbiased/parser.py                  | 29 ++++++-----
 unbiased/unbiasedFunctions.py       | 98 +++++++++++--------------------------
 5 files changed, 49 insertions(+), 89 deletions(-)

diff --git a/setup.py b/setup.py
index 2755304..57c27c0 100644
--- a/setup.py
+++ b/setup.py
@@ -13,6 +13,7 @@ setup(
     install_requires=[
         'jinja2',
         'Pillow',
+        'requests',
     ],
     entry_points={
         'console_scripts': [
diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css
index 6817cc9..1424ee9 100755
--- a/unbiased/html_template/unbiased.css
+++ b/unbiased/html_template/unbiased.css
@@ -108,8 +108,8 @@ a:hover{
     width:350px;
     height:200px;
     overflow:hidden;
-    background-size: auto 234px;/*cover;*/
-    background-position: top center;/*center center;*/
+    background-size: 100%;
+    background-position: center center;
     margin:0 auto;
 }
 
@@ -169,8 +169,8 @@ a:hover{
     width:150px;
     height:100px;
     overflow:hidden;
-    background-size: auto 117px;/*cover;*/
-    background-position: top center;/*center center;*/
+    background-size: 100%;
+    background-position: center center;
     float:left;
     max-width:35%;
 }
diff --git a/unbiased/main.py b/unbiased/main.py
index c8a113e..c760788 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -76,7 +76,7 @@ def run(webroot, scratch):
             logger.debug('Build failed, retrying. source={} ex={}'.format(source, ex))
 
     #scrape all urls and build data structure
-    newsSourceArr=buildNewsSourceArr(sourceList, scratch)
+    newsSourceArr = sourceList
 
     #build the output file HTML
     outputHTML=buildOutput(newsSourceArr, webroot, scratch)
diff --git a/unbiased/parser.py b/unbiased/parser.py
index 2bba27d..0a8398c 100755
--- a/unbiased/parser.py
+++ b/unbiased/parser.py
@@ -4,6 +4,9 @@ import logging
 import os
 import re
 import subprocess
+import urllib.parse
+
+import requests
 
 from unbiased.unbiasedObjects import *
 from unbiased.unbiasedFunctions import buildArticle
@@ -16,21 +19,11 @@ Takes in a URL, downloads the file to a temp file,
 reads the file into a string, and returns that string
 '''
 def urlToContent(url, scratchDir, sourceEncoding='utf8'):
-    temp_file = os.path.join(scratchDir, 'temp1.html')
-
-    #download file
-    #os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url)
-    subprocess.check_call(['wget', '-q', '-O', temp_file, '--no-check-certificate', url])
-
-    #read file
-    if sourceEncoding=='utf8':
-        f=open(temp_file, 'r', encoding="utf8")
+    res = requests.get(url)
+    if res.status_code == 200:
+        return res.text
     else:
-        f=open(temp_file, 'r', encoding="latin-1")
-    content=f.read()
-    f.close()
-
-    return content
+        raise Exception("Failed to download {}".format(url))
 
 
 '''
@@ -39,6 +32,13 @@ calls the file scraper and appends the new Article object.
 Returns a newsSource2 object
 '''
 def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
+
+    url_parts = urllib.parse.urlparse(url)
+    scheme = url_parts.scheme
+    h1URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h1URLs]
+    h2URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h2URLs]
+    h3URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h3URLs]
+
     h1Arr=[]
     a=buildArticle(h1URLs[0], name, scratchDir)
     if a==None:
@@ -54,7 +54,6 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
     else:
         logger.debug('H2 Nonetype in '+name)
 
-
     h3Arr=[]
     for x in h3URLs:
        a=buildArticle(x, name, scratchDir)
diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py
index 415a3cc..0181beb 100644
--- a/unbiased/unbiasedFunctions.py
+++ b/unbiased/unbiasedFunctions.py
@@ -1,4 +1,5 @@
 import html
+import io
 import logging
 import os
 import pkgutil
@@ -6,10 +7,12 @@ import random
 import re
 import subprocess
 import time
-
-from unbiased.unbiasedObjects import *
+import urllib.parse
 
 from PIL import Image
+import requests
+
+from unbiased.unbiasedObjects import *
 
 logger = logging.getLogger('unbiased')
 
@@ -21,16 +24,25 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
         logger.debug(sourceName)
         logger.debug(url)
 
-    temp_article = os.path.join(scratchDir, 'temp_article.html')
+    url_parts = urllib.parse.urlparse(url)
+    scheme = url_parts.scheme
 
     #download url
-    #os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url)
-    subprocess.check_call(['wget', '-q', '-O', temp_article, '--no-check-certificate', url])
+    try:
+        res = requests.get(url)
+    except Exception as ex:
+        logger.error("""ARTICLE DOWNLOADING ERROR
+        SOURCE:\t{}
+        URL:\t{}""".format(sourceName, url))
+        return None
 
-    #read the file in
-    f=open(temp_article, 'r', encoding="utf8")
-    content=f.read()
-    f.close()
+    if res.status_code == 200:
+        content = res.text
+    else:
+        logger.error("""ARTICLE DOWNLOADING ERROR
+        SOURCE:\t{}
+        URL:\t{}""".format(sourceName, url))
+        return None
 
     try:
         if sourceName=='The Guardian US':
@@ -61,6 +73,8 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
             #trim to just before it then lop it off
             img=img[:-1].strip()
             img=img[:-1]
+            # fix the scheme if it's missing
+            img = urllib.parse.urlparse(img, scheme=scheme).geturl()
 
             if debugging:
                 logger.debug(img)
@@ -232,68 +246,16 @@ def printOutputHTML(outputHTML, outDir):
         with open(os.path.join(outDir, filename), 'wb') as fp:
             fp.write(data)
 
-def buildNewsSourceArr(sourceList, scratchDir):
-
-    #build the data structure
-    i=0
-    listLen=len(sourceList)
-    while i < listLen:
-        source=sourceList[i]
-
-        if type(source) is NewsSource2:
-            i+=1
-            continue
-
-        url=source.url
-
-        temp_file = os.path.join(scratchDir, 'temp{}.html'.format(i))
-
-        #download file
-        #os.system('wget -q -O scratch/temp'+str(i)+'.html --no-check-certificate '+url)
-        subprocess.check_call(['wget', '-q', '-O', temp_file, '--no-check-certificate', url])
-
-        #read file
-        f=open(temp_file, 'r', encoding="utf8")
-        content=f.read()
-        f.close()
-
-        #delete file MAYBE DON'T DO THIS? CAUSES OS ERRORS
-        #os.remove(temp_file)
-
-        #add stories etc to the NewsSource object
-        h1s, h2s, h3s=extractURLs(content, source)
-
-        #build the Article objects and add to newsSource's appropriate list
-        if h1s!=None and h2s!=None:
-            for url in h1s:
-                article=buildArticle(url, source.name, scratchDir)
-                if article!=None: source.addArticle(article, 1) #sourceList[i].h1Arr.append(article)
-            for url in h2s:
-                article=buildArticle(url, source.name, scratchDir)
-                if article!=None: sourceList[i].h2Arr.append(article)
-            for url in h3s:
-                article=buildArticle(url, source.name, scratchDir)
-                if article!=None: sourceList[i].h3Arr.append(article)
-            i+=1
-        else:
-            sourceList.remove(source)
-            listLen-=1
-
-
-    #return the original sourceList,
-    #since everything should have been modified in place
-    return sourceList
-
 def pullImage(url, index, webroot, scratch, target_width=350, target_height=200):
     extension = url.split('.')[-1].split('?')[0]
     img_name = 'img{}.{}'.format(index, extension)
-    tmp_file = os.path.join(scratch, img_name)
-    try:
-        subprocess.check_call(['wget', '-q', '-O', tmp_file, '--no-check-certificate', url])
-    except Exception as ex:
-        logger.error('Failed to pull image: url={} ex={}'.format(url, ex))
+    res = requests.get(url)
+    if res.status_code == 200:
+        content = res.content
+    else:
+        logger.error('Image not found: url={}'.format(url))
        return ''
-    img = Image.open(tmp_file)
+    img = Image.open(io.BytesIO(content))
     # crop to aspect ratio
     target_ar = target_width / target_height
     left, top, right, bottom = img.getbbox()
@@ -315,6 +277,4 @@ def pullImage(url, index, webroot, scratch, target_width=350, target_height=200)
     jpg_name = 'img{}.jpg'.format(index)
     out_file = os.path.join(webroot, jpg_name)
     img.save(out_file, 'JPEG')
-    if tmp_file != out_file:
-        os.remove(tmp_file)
     return jpg_name
--
cgit v1.2.3
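
Note, not part of the patch above: the change swaps every wget subprocess call for an in-process requests.get() followed by a status-code check, and pullImage now decodes the downloaded bytes straight from memory via io.BytesIO instead of writing a scratch file. A minimal sketch of that pattern, assuming requests and Pillow are installed; the helper names fetch_text and fetch_image are illustrative only and do not appear in the patch:

    import io

    import requests
    from PIL import Image

    def fetch_text(url):
        # In-process download replacing `wget -q -O <tmpfile> <url>`.
        res = requests.get(url)
        if res.status_code == 200:
            return res.text
        raise Exception("Failed to download {}".format(url))

    def fetch_image(url):
        # Binary payloads come from res.content; no temp file is needed on disk.
        res = requests.get(url)
        if res.status_code != 200:
            return None
        return Image.open(io.BytesIO(res.content))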