#!/usr/bin/env python3

# The project imports come first, as in the original, so that the standard
# library imports below win if the star import exports a clashing name.
from unbiasedObjects import *
from unbiasedFunctions import buildArticle
import os
import re
import subprocess


def urlToContent(url):
    """Download *url* with wget and return the downloaded text.

    The page is saved to the scratch file ``scratch/temp1.html`` and then
    read back with the platform default encoding. wget's exit status is
    not checked, so after a failed download whatever the scratch file
    contains is returned.
    """
    # An argument list (no shell) keeps URLs containing '&', ';' or spaces
    # from being split or interpreted by the shell.
    subprocess.run(
        ['wget', '-q', '-O', 'scratch/temp1.html',
         '--no-check-certificate', url],
        check=False,
    )
    with open('scratch/temp1.html', 'r') as f:
        return f.read()


def _buildArticles(urls, name):
    """Build one article per URL, dropping URLs for which buildArticle
    returns None."""
    built = (buildArticle(x, name) for x in urls)
    return [a for a in built if a is not None]


def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
    """Create a NewsSource2 from three tiers of article URLs.

    Only the first h1 URL is used, and its article is stored even if
    buildArticle returns None for it. For h2 and h3, articles that come
    back as None are left out.

    Returns the new NewsSource2 object.
    """
    h1Arr = [buildArticle(h1URLs[0], name)]
    h2Arr = _buildArticles(h2URLs, name)
    h3Arr = _buildArticles(h3URLs, name)

    return NewsSource2(name, url, h1Arr, h2Arr, h3Arr)


def _dedupeLevel(urls, higherUrls):
    """Return the entries of *urls* that survive de-duplication.

    An entry is dropped when it contains, or is contained in, any entry
    of *higherUrls*; when another entry of *urls* is a proper substring
    of it (the shorter URL is kept); or when it exactly repeats an
    earlier entry of *urls* (the first copy is kept).
    """
    kept = []
    for idx, candidate in enumerate(urls):
        if any(candidate in h or h in candidate for h in higherUrls):
            continue
        shadowed = any(
            other in candidate and (other != candidate or otherIdx < idx)
            for otherIdx, other in enumerate(urls)
        )
        if not shadowed:
            kept.append(candidate)
    return kept


def removeDuplicates(h1s, h2s, h3s):
    """Remove URLs that a page repeats across headline tiers.

    Checked hierarchically: an h2 that matches an h1 is removed from
    h2s, and an h3 that matches an h1 or a remaining h2 is removed from
    h3s. Partial URLs count as matches (e.g. nytimes.com/story.html is
    the same as nytimes.com/story.html?var=x). h1s is kept untouched.

    h2s and h3s are updated in place; the three lists are also returned.
    """
    # Each list is rebuilt in one pass. Removing entries one at a time
    # from a collected removal list raised ValueError whenever the same
    # URL was flagged twice, and deleted every copy of an exact repeat.
    h2s[:] = _dedupeLevel(h2s, h1s)
    h3s[:] = _dedupeLevel(h3s, h1s + h2s)
    return h1s, h2s, h3s


def _dropBadStories(source, levels, level, attr, badItems, reason):
    """Remove from levels[level] each story whose *attr* contains a bad item.

    When an h1 story (level 0) is removed and an h2 story is available,
    the first h2 story is moved into the h1 list. Prints one report per
    removed story.
    """
    stories = levels[level]
    pos = 0
    # Index-based walk: the list shrinks as stories are removed, and a
    # story promoted into the h1 list has to be checked as well.
    while pos < len(stories):
        story = stories[pos]
        value = getattr(story, attr)
        hit = next((item for item in badItems if item in value), None)
        if hit is None:
            pos += 1
            continue
        del stories[pos]
        # If it was in the h1 slot, bump the first h2 up into the h1
        # slot -- but only if there is an h2 left to promote.
        if level == 0 and levels[1]:
            stories.append(levels[1].pop(0))
        print('Removed:\n'+source.name+'\n'+story.title+' from '+source.name+'\nReason: '+reason+' ('+hit+')\n')


def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None):
    """Drop stories whose title, description, author, image or URL
    contains a blacklisted substring.

    Each bad* argument is a list of substrings, or None to skip that
    criterion. Criteria are applied in the order title, description,
    author, image, URL, and each one is run over the h1, h2 and h3
    lists in turn. source.h1Arr, source.h2Arr and source.h3Arr are
    modified in place.

    Returns the same *source* object.
    """
    levels = [source.h1Arr, source.h2Arr, source.h3Arr]
    criteria = (
        (badTitleArr, 'title', 'Title'),
        (badDescArr, 'description', 'Description'),
        (badAuthorArr, 'author', 'Author'),
        (badImgArr, 'img', 'Image'),
        (badURLArr, 'url', 'URL'),
    )
    for badItems, attr, reason in criteria:
        if badItems is None:
            continue
        for level in range(len(levels)):
            _dropBadStories(source, levels, level, attr, badItems, reason)
    return source


# NOTE(review): from buildGuardian() onward this copy of the file is
# corrupted -- apparently the HTML markers inside the split() string
# literals were stripped, which fuses several builder functions together.
# It cannot be parsed or safely reconstructed from here, so the original
# fragment is preserved verbatim in the comment below (it continues on the
# following lines) and must be restored from a clean copy of the file.
# def buildGuardian(): url='http://www.theguardian.com/us-news' name='The Guardian' #DOWNLOAD HOMEPAGE CONTENT content=urlToContent(url) #get main headline h1=content h1=h1.split('', 3)[2:] for x in h2: x=x.split('

', 1)[1] h3=h3.split('', 1)[0] while '
  • ' in h2: h2=h2.split('
  • ', 1)[1] h2=h2.split('', 1)[1] h2=h2.split('
    ', 1)[1] h3=h3.split('Watch/Listen', 1)[0] while '
    ', 1)[1] h1=h1.split('href="', 1)[1] h1=h1.split('"', 1)[0] h1s=[h1] #GET SECONDARY HEADLINES h2=content h2s=[] h2=h2.split('
    ', 1)[1] h2=h2.split('
    ' in h2: h2=h2.split('
    ', 1)[1] h2=h2.split('href="', 1)[1] x=h2.split('"', 1)[0] if h1 not in x: h2s.append(x) #GET TERTIARY HEADLINES h3=content h3s=[] h3=h3.split('Today\'s Standard', 1)[1] h3=h3.split('
    ' in h3: h3=h3.split('
    ', 1)[1] h3=h3.split('href="', 1)[1] x=h3.split('"', 1)[0] if h1 not in x: h3s.append(x) #Need to add URL prefix to all URLs for i in range(len(h1s)): h1s[i]=url+h1s[i] for i in range(len(h2s)): h2s[i]=url+h2s[i] for i in range(len(h3s)): h3s[i]=url+h3s[i] h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) wkl=buildNewsSource2(name, url, h1s, h2s, h3s) #REMOVE BAD STORIES badTitleArr=None ## if flagged again, remove Micah Mattix badDescArr=['Matt Labash'] badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner', 'MARK HEMINGWAY'] badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png'] wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr) return wkl def buildNPR(): url='http://www.npr.org/sections/news/' name='NPR' #DOWNLOAD HOMEPAGE CONTENT content=urlToContent(url) #get main headline h1=content h1=h1.split('', 1)[1] h1=h1.split('', 1)[1] h2=h2.split('', 1)[0] while '
    ' in h2: h2=h2.split('
    ', 1)[1] h2=h2.split('', 1)[1] h2=h2.split('
    ', 1)[0] while '', 1)[1] h1=h1.split('', 1)[1] h2=h2.split('', 1)[0] #remove "collection" sets while '