- -

F.B.I. Questioned Flynn About Russia Call

- -

-''' +#!/usr/bin/env python3 + +from unbiasedObjects import * +from unbiasedFunctions import buildArticle +import os +import re + + +''' +Takes in a URL, downloads the file to a temp file, +reads the file into a string, and returns that string +''' +def urlToContent(url): + #download file + os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url) + + #read file + f=open('scratch/temp1.html', 'r')#, encoding="utf8") + content=f.read() + f.close() + + return content + + +''' +Creates a new newsSource2 object. For each URL in h1-h3URLs, +calls the file scraper and appends the new Article object. +Returns a newsSource2 object +''' +def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs): + h1Arr=[] + h1Arr.append(buildArticle(h1URLs[0], name)) + + h2Arr=[] + for x in h2URLs: + a=buildArticle(x, name) + if a!=None: + h2Arr.append(a) + + h3Arr=[] + for x in h3URLs: + a=buildArticle(x, name) + if a!=None: + h3Arr.append(a) + + #BUILD THE NEWS SOURCE + newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr) + + return newsSource + + +''' +Some sites will replicate URLs across the page. This function removes them. +Check hierarchically: if h3 exists in h1s or h2s, remove from h3s; +if h2 exists in h1s, remove from h2s + +also check partial URLs (e.g. nytimes.com/story.html is the same as +nytimes.com/story.html?var=x +''' +def removeDuplicates(h1s, h2s, h3s): + #Assume h1s is one element, and keep it + + #remove h2 duplicates + removeArr=[] + for i in range(len(h2s)): + #check internally + for j in range(len(h2s)): + if i==j: + continue + else: + if h2s[i] in h2s[j]: + removeArr.append(h2s[j]) + #check against h1s + for k in range(len(h1s)): + if (h2s[i] in h1s[k]) or (h1s[k] in h2s[i]): + removeArr.append(h2s[i]) + for x in removeArr: + h2s.remove(x) + + #remove h3 duplicates + removeArr=[] + for i in range(len(h3s)): + #check internally + for j in range(len(h3s)): + if i==j: + continue + else: + if h3s[i] in h3s[j]: + removeArr.append(h3s[j]) + #check against h1s and h2s + h1and2=h1s+h2s + for k in range(len(h1and2)): + if (h3s[i] in h1and2[k]) or (h1and2[k] in h3s[i]): + removeArr.append(h3s[i]) + for x in removeArr: + h3s.remove(x) + + + return h1s, h2s, h3s + + + +def removalNotification(source, title, reason, value): + print('*************************') + print('\t\tSTORY REMOVED') + print('SOURCE: '+source) + print('TITLE: \t'+title) + print('REASON: '+reason) + print('VALUE: \t'+value) + print('*************************\n\n') + + +def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None): + + arr=[source.h1Arr, source.h2Arr, source.h3Arr] + + if badTitleArr!=None: + for i in range(len(arr)): + for hed in arr[i]: + for item in badTitleArr: + if item in hed.title: + arr[i].remove(hed) + #if it's in the h1 slot, bump up the + # first h2 into the h1 slot + if i==0: + arr[0].append(arr[1][0]) + arr[1].remove(arr[1][0]) + removalNotification(source.name, hed.title, 'Title', item) + + + if badDescArr!=None: + for i in range(len(arr)): + for hed in arr[i]: + for item in badDescArr: + if item in hed.description: + arr[i].remove(hed) + #if it's in the h1 slot, bump up the + # first h2 into the h1 slot + if i==0: + arr[0].append(arr[1][0]) + arr[1].remove(arr[1][0]) + removalNotification(source.name, hed.title, 'Description', item) + + + if badAuthorArr!=None: + for i in range(len(arr)): + for hed in arr[i]: + for item in badAuthorArr: + if item in hed.author: + arr[i].remove(hed) + #if it's in the h1 slot, bump up the + # first h2 into the h1 slot + if i==0: + arr[0].append(arr[1][0]) + arr[1].remove(arr[1][0]) + removalNotification(source.name, hed.title, 'Author', item) + + + if badImgArr!=None: + for i in range(len(arr)): + for hed in arr[i]: + for item in badImgArr: + if item in hed.img: + arr[i].remove(hed) + #if it's in the h1 slot, bump up the + # first h2 into the h1 slot + if i==0: + arr[0].append(arr[1][0]) + arr[1].remove(arr[1][0]) + removalNotification(source.name, hed.title, 'Image', item) + + if badURLArr!=None: + for i in range(len(arr)): + for hed in arr[i]: + for item in badURLArr: + if item in hed.url: + arr[i].remove(hed) + #if it's in the h1 slot, bump up the + # first h2 into the h1 slot + if i==0: + arr[0].append(arr[1][0]) + arr[1].remove(arr[1][0]) + removalNotification(source.name, hed.title, 'URL', item) + + return source + + + + +def buildTheHill(): + url='http://thehill.com' + name='The Hill' + + #DOWNLOAD HOMEPAGE CONTENT + content=urlToContent(url) + + #get main headline + h1=content + h1=h1.split('

', 1)[1] + h1=h1.split('', 1)[1] + h2=h2.split('', 1)[0] + while '

', 1)[1] + h3=h3.split('', 1)[0] + while '

', 3)[2:] + for x in h2: + x=x.split('

', 1)[1] + h3=h3.split('
', 1)[0] + #this story section goes on forever; just grab the first 5 + while '
]+>') + for i in range(len(articleArr)): + desc=urlToContent(articleArr[i].url) + desc=desc.split('
', 1)[1] + desc=desc.split('
', 1)[1] + desc=TAG_RE.sub('', desc) + desc=desc.replace('\n', ' ') + desc=desc[:144] + articleArr[i].description=desc + + return articleArr + + + +def buildBlaze(): + url='http://theblaze.com' + name='The Blaze' + + #DOWNLOAD HOMEPAGE CONTENT + content=urlToContent(url) + + #get main headline + h1=content + h1=h1.split('', 1)[1] + h1=h1.split('', 1)[0] + h1=h1.split('', 1)[1] + h2=h2.split('', 1)[0] + while '\n\n
' in h2: + h2=h2.split('
\n\n
', 1)[1] + h2=h2.split('href="', 1)[1] + x=h2.split('"', 1)[0] + if h1 not in x: + h2s.append(url+x) + + #GET TERTIARY HEADLINES + h3=content + h3s=[] + h3=h3.split('', 1)[1] + #this story section goes on forever; just grab the first 5 + while len(h3s)<5: + h3=h3.split('' in content: + h1=h1.split('
', 1)[1] + h1=h1.split('
', 1)[0] + h1=h1.split('
', 1)[1] + h1=h1.split('
', 1)[0] + while '
' in h2: + h2=h2.split('
', 1)[1] + h2=h2.split('', 1)[1] + h2=h2.split('
', 1)[1] + h3=h3.split('Watch/Listen', 1)[0] + while '
', 1)[1] + h1=h1.split('href="', 1)[1] + h1=h1.split('"', 1)[0] + h1s=[h1] + + #GET SECONDARY HEADLINES + h2=content + h2s=[] + h2=h2.split('
', 1)[1] + h2=h2.split('
' in h2: + h2=h2.split('
', 1)[1] + h2=h2.split('href="', 1)[1] + x=h2.split('"', 1)[0] + if h1 not in x: + h2s.append(x) + + #GET TERTIARY HEADLINES + h3=content + h3s=[] + h3=h3.split('Today\'s Standard', 1)[1] + h3=h3.split('
' in h3: + h3=h3.split('
', 1)[1] + h3=h3.split('href="', 1)[1] + x=h3.split('"', 1)[0] + if h1 not in x: + h3s.append(x) + + #Need to add URL prefix to all URLs + for i in range(len(h1s)): + h1s[i]=url+h1s[i] + for i in range(len(h2s)): + h2s[i]=url+h2s[i] + for i in range(len(h3s)): + h3s[i]=url+h3s[i] + + + h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) + wkl=buildNewsSource2(name, url, h1s, h2s, h3s) + + #REMOVE BAD STORIES + badTitleArr=None + ## if flagged again, remove Micah Mattix + badDescArr=['Matt Labash'] + badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner', 'MARK HEMINGWAY'] + badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png'] + wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr) + + return wkl + + + + +def buildNPR(): + url='http://www.npr.org/sections/news/' + name='NPR' + + #DOWNLOAD HOMEPAGE CONTENT + content=urlToContent(url) + + #get main headline + h1=content + h1=h1.split('', 1)[1] + h1=h1.split('', 1)[1] + h2=h2.split('', 1)[0] + while '
' in h2: + h2=h2.split('
', 1)[1] + h2=h2.split('', 1)[1] + h2=h2.split('
', 1)[0] + while '', 1)[1] + h1=h1.split('', 1)[1] + h2=h2.split('', 1)[0] + #remove "collection" sets + while '
' in h2: + arr=h2.split('
', 1) + h2=arr[0]+arr[1].split('', 1)[1] + #Grab the remaining URLs + while '', 1)[1] + h2=h2.split('', 1)[0] + #remove "collection" sets + while '
' in h2: + arr=h2.split('
', 1) + h2=arr[0]+arr[1].split('', 1)[1] + #Grab the remaining URLs + while '', 1)[1] + h3=h3.split('', 1)[0] + #remove "collection" sets + while '
' in h2: + arr=h3.split('
', 1) + h3=arr[0]+arr[1].split('', 1)[1] + + #Grab the remaining URLs + while ' + +
+ +
+
Top News
+ +
+ +
+
+
The 45th President
+
+ +
+ +
+ +
+ +
+
+
F.B.I. Questioned Flynn About Russia Call
+
+
+ +
+''' -- cgit v1.2.3