#!/usr/bin/env python3
from unbiasedObjects import *
from unbiasedFunctions import buildArticle
import os
import re

'''
Takes in a URL, downloads the file to a temp file,
reads the file into a string, and returns that string.
'''
def urlToContent(url):
    # download the page to the scratch directory
    # (note: url is interpolated into a shell command, so it should only
    # come from the trusted, hard-coded source list)
    os.system('wget -q -O scratch/temp1.html --no-check-certificate ' + url)
    # read the downloaded file back in
    f = open('scratch/temp1.html', 'r', encoding='utf8')
    content = f.read()
    f.close()
    return content
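
# A minimal usage sketch (assumes the scratch/ directory exists and wget is
# installed; example.com is a placeholder, not a real source):
#   html = urlToContent('http://www.example.com/')
#   print(html[:200])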

'''
Creates a new NewsSource2 object. For each URL in h1URLs-h3URLs,
calls the article scraper and appends the resulting Article object.
Returns the NewsSource2 object.
'''
def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
    # the h1 slot holds a single top story
    h1Arr = []
    h1Arr.append(buildArticle(h1URLs[0], name))

    h2Arr = []
    for x in h2URLs:
        h2Arr.append(buildArticle(x, name))

    h3Arr = []
    for x in h3URLs:
        h3Arr.append(buildArticle(x, name))

    # build the news source
    newsSource = NewsSource2(name, url, h1Arr, h2Arr, h3Arr)
    return newsSource
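
# A minimal usage sketch (the name and URLs here are placeholders, not a
# real configured source):
#   source = buildNewsSource2(
#       'Example News', 'http://www.example.com',
#       ['http://www.example.com/h1.html'],
#       ['http://www.example.com/h2a.html', 'http://www.example.com/h2b.html'],
#       ['http://www.example.com/h3a.html'])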

'''
Some sites replicate URLs across the page. This function removes them.
Checks hierarchically: if an h3 URL exists in h1s or h2s, remove it from
h3s; if an h2 URL exists in h1s, remove it from h2s.
Also checks partial URLs (e.g. nytimes.com/story.html is the same as
nytimes.com/story.html?var=x).
'''
def removeDuplicates(h1s, h2s, h3s):
    # assume h1s is one element, and keep it

    # remove h2 duplicates
    removeArr = []
    for i in range(len(h2s)):
        # check internally: when one URL contains another
        # (e.g. a query-string variant), drop the longer one
        for j in range(len(h2s)):
            if i == j:
                continue
            if h2s[i] in h2s[j] and h2s[j] not in removeArr:
                removeArr.append(h2s[j])
        # check against h1s
        for k in range(len(h1s)):
            if (h2s[i] in h1s[k]) or (h1s[k] in h2s[i]):
                if h2s[i] not in removeArr:
                    removeArr.append(h2s[i])
    for x in removeArr:
        if x in h2s:
            h2s.remove(x)

    # remove h3 duplicates
    removeArr = []
    for i in range(len(h3s)):
        # check internally
        for j in range(len(h3s)):
            if i == j:
                continue
            if h3s[i] in h3s[j] and h3s[j] not in removeArr:
                removeArr.append(h3s[j])
        # check against h1s and h2s
        h1and2 = h1s + h2s
        for k in range(len(h1and2)):
            if (h3s[i] in h1and2[k]) or (h1and2[k] in h3s[i]):
                if h3s[i] not in removeArr:
                    removeArr.append(h3s[i])
    for x in removeArr:
        if x in h3s:
            h3s.remove(x)

    return h1s, h2s, h3s
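
# A worked sketch of the containment rule with made-up URLs: the h2
# query-string variant of the h1 story and the h3 copy of an h2 story
# are both dropped:
#   h1s = ['http://nytimes.com/story1.html']
#   h2s = ['http://nytimes.com/story1.html?var=x', 'http://nytimes.com/story2.html']
#   h3s = ['http://nytimes.com/story2.html', 'http://nytimes.com/story3.html']
#   h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
#   # h2s is now ['http://nytimes.com/story2.html']
#   # h3s is now ['http://nytimes.com/story3.html']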

'''
Removes stories whose author, description, or image URL contains any of
the given blacklist strings. Any of the filter arrays may be None.
'''
def removeBadStories(source, badDescArr, badAuthorArr, badImgArr):
    if badAuthorArr is not None:
        # iterate over a copy so removing items doesn't skip elements
        for h1 in list(source.h1Arr):
            for item in badAuthorArr:
                if item in h1.author:
                    source.h1Arr.remove(h1)
                    # if it's in the h1 slot, bump the first h2 up into the h1 slot
                    if source.h2Arr:
                        source.h1Arr.append(source.h2Arr[0])
                        source.h2Arr.remove(source.h2Arr[0])
                    print('removed ' + h1.title + ' from ' + source.name + ' Reason: bad author')
                    break
        for h2 in list(source.h2Arr):
            for item in badAuthorArr:
                if item in h2.author:
                    source.h2Arr.remove(h2)
                    print('removed ' + h2.title + ' from ' + source.name + ' Reason: bad author')
                    break
        for h3 in list(source.h3Arr):
            for item in badAuthorArr:
                if item in h3.author:
                    source.h3Arr.remove(h3)
                    print('removed ' + h3.title + ' from ' + source.name + ' Reason: bad author')
                    break

    if badDescArr is not None:
        for h1 in list(source.h1Arr):
            for item in badDescArr:
                if item in h1.description:
                    source.h1Arr.remove(h1)
                    # if it's in the h1 slot, bump the first h2 up into the h1 slot
                    if source.h2Arr:
                        source.h1Arr.append(source.h2Arr[0])
                        source.h2Arr.remove(source.h2Arr[0])
                    print('removed ' + h1.title + ' from ' + source.name + ' Reason: bad description')
                    break
        for h2 in list(source.h2Arr):
            for item in badDescArr:
                if item in h2.description:
                    source.h2Arr.remove(h2)
                    print('removed ' + h2.title + ' from ' + source.name + ' Reason: bad description')
                    break
        for h3 in list(source.h3Arr):
            for item in badDescArr:
                if item in h3.description:
                    source.h3Arr.remove(h3)
                    print('removed ' + h3.title + ' from ' + source.name + ' Reason: bad description')
                    break

    if badImgArr is not None:
        for h1 in list(source.h1Arr):
            for item in badImgArr:
                if item in h1.img:
                    source.h1Arr.remove(h1)
                    # if it's in the h1 slot, bump the first h2 up into the h1 slot
                    if source.h2Arr:
                        source.h1Arr.append(source.h2Arr[0])
                        source.h2Arr.remove(source.h2Arr[0])
                    print('removed ' + h1.title + ' from ' + source.name + ' Reason: bad image')
                    break
        for h2 in list(source.h2Arr):
            for item in badImgArr:
                if item in h2.img:
                    source.h2Arr.remove(h2)
                    print('removed ' + h2.title + ' from ' + source.name + ' Reason: bad image')
                    break
        for h3 in list(source.h3Arr):
            for item in badImgArr:
                if item in h3.img:
                    source.h3Arr.remove(h3)
                    print('removed ' + h3.title + ' from ' + source.name + ' Reason: bad image')
                    break

    return source
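
# A minimal usage sketch (the filter strings are placeholders; pass None for
# any check a source doesn't need):
#   source = removeBadStories(source, ['Sponsored Content'], ['Associated Press'], None)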

'''
Fixes the oddly short og:description fields The Blaze provides
by grabbing the first portion of the story text instead.
'''
def blazeFixDesc(articleArr):
    # regex matching any HTML tag, for stripping markup from the story text
    TAG_RE = re.compile(r'<[^>]+>')
    for i in range(len(articleArr)):
        desc = urlToContent(articleArr[i].url)
        desc = desc.split('