path: root/unbiasedFunctions.py
Diffstat (limited to 'unbiasedFunctions.py')
-rw-r--r--  unbiasedFunctions.py  248
1 file changed, 248 insertions, 0 deletions
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
new file mode 100644
index 0000000..84b4ec9
--- /dev/null
+++ b/unbiasedFunctions.py
@@ -0,0 +1,248 @@
+from unbiasedObjects import *
+import os
+import random
+import time
+
+#take in a url and source name, return an Article built from the page's
+#Open Graph (twitter card) tags
+def buildArticle(url, sourceName):
+
+    print(sourceName)
+    print(url)
+    print()
+    #download url
+    os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url)
+
+    #read the file in
+    f=open('scratch/temp_article.html', 'r', encoding="utf8")
+    content=f.read()
+    f.close()
+
+    #the attribute quote could be ' or ", so drop the opening quote with [1:],
+    #then trim any self-closing '/' and finally lop off the closing quote
+    img=content.split('og:image" content=')[1][1:].split('>')[0].strip()
+    if img[-1]=='/':
+        img=img[:-1].strip()
+    img=img[:-1]
+
+    title=content.split('og:title" content=')[1][1:].split('>')[0].strip()
+    if title[-1]=='/':
+        title=title[:-1].strip()
+    title=title[:-1]
+
+    a=Article(title, url, img, sourceName)
+    return a
+
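+#A minimal usage sketch of buildArticle (the URL and source name below are
+#hypothetical, just to show the call shape):
+#    a=buildArticle('https://www.example.com/some-story.html', 'Example News')
+#    print(a.title, a.img)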
+
+#do the hardcore HTML parsing: trim content to the section between the dividers,
+#then pull out every substring bracketed by the delStart chain and delEnd
+def splitHTML(content, sectionDividerStart, sectionDividerEnd, delStart, delEnd):
+    retArr=[]
+
+    if sectionDividerStart!=None:
+        content=content.split(sectionDividerStart)[1]
+    if sectionDividerEnd!=None:
+        content=content.split(sectionDividerEnd)[0]
+    if delStart!=[]:
+        while True:
+            #walk the start delimiters in order, narrowing x each time
+            x=content
+            for delim in delStart:
+                if delim in x:
+                    x=x.split(delim)[1]
+            x=x.split(delEnd)[0]
+            if x not in retArr:
+                retArr.append(x)
+            #advance past the first start delimiter and repeat
+            content=content.split(delStart[0], 1)
+            if len(content)==1:
+                break
+            else:
+                content=content[1]
+
+    return retArr
+
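+#Rough illustration of splitHTML on a made-up snippet (not any real source's markup):
+#    splitHTML('<ul id="top"><li><a href="/a1">x</a><li><a href="/a2">y</a></ul>',
+#              '<ul id="top">', '</ul>', ['href="'], '"')
+#    would return ['/a1', '/a2']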
+
+
+'''
+************************
+
+Need to fix this function to use splitHTML() and actually loop through
+all of the links instead of just using the first one.
+
+************************
+'''
+
+#take in the already-downloaded HTML of a source's main page (e.g. nytimes.com)
+#and return lists of the story urls for each headline tier
+def extractURLs(content, source):
+    h1s=[]
+    h2s=[]
+    h3s=[]
+
+    #h1: grab just the top headline (see the TODO above about looping)
+    h1=content
+    if source.h1SectionDividerStart!=None:
+        h1=h1.split(source.h1SectionDividerStart)[1]
+    if source.h1SectionDividerEnd!=None:
+        h1=h1.split(source.h1SectionDividerEnd)[0]
+    for delim in source.h1DelStart:
+        h1=h1.split(delim)[1]
+    h1=h1.split(source.h1DelEnd)[0]
+    if '.com' not in h1:
+        h1=source.url+h1
+    h1s.append(h1)
+
+    #h2: walk every secondary headline in the section
+    h2=content
+    if source.h2SectionDividerStart!=None:
+        h2=h2.split(source.h2SectionDividerStart, 1)[1]
+    if source.h2SectionDividerEnd!=None:
+        h2=h2.split(source.h2SectionDividerEnd, 1)[0]
+
+    while source.h2DelStart[0] in h2:
+        x=h2
+        for delim in source.h2DelStart:
+            x=x.split(delim)[1]
+            h2=h2.split(delim, 1)[1]
+        x=x.split(source.h2DelEnd)[0]
+        h2=h2.split(source.h2DelEnd, 1)[1]
+        if '.com' not in x:
+            x=source.url+x
+        h2s.append(x)
+
+
+    '''
+    h2=content.split(source.h2SectionDividerStart, 1)[1]
+    h2=h2.split(source.h2SectionDividerEnd, 1)[0]
+
+    if source.h2DelStart!=[]:
+        while True:
+            x=h2
+            for delim in source.h2DelStart:
+                if delim in h2:
+                    x=h2.split(delim)[1]
+            x=x.split(source.h2DelEnd)[0]
+            if '.com' not in x:
+                x=source.url+x
+            if x not in h2s:
+                h2s.append(x)
+                print(x)
+            h2=h2.split(source.h2DelStart[0], 1)
+            if(len(h2)==1):
+                break
+            else:
+                h2=h2[1]#:][0]
+
+
+
+    h2s=splitHTML(content,
+                  source.h2SectionDividerStart,
+                  source.h2SectionDividerEnd,
+                  source.h2DelStart,
+                  source.h2DelEnd)
+
+    if source.h2SectionDividerStart!=None:
+        h2=h2.split(source.h2SectionDividerStart)[1]
+    if source.h2SectionDividerEnd!=None:
+        h2=h2.split(source.h2SectionDividerEnd)[0]
+
+    delim0=source.h2DelStart[0]
+    while delim0 in h2:
+        for delim in source.h2DelStart:
+            url=h2.split(delim)[1]
+            h2=''.join(h2.split(delim)[1:])
+        url=h2.split(source.h2DelEnd)[0]
+        h2=h2.split(source.h2DelEnd)[1]
+        if '.com' not in url:
+            url=source.url+url
+        h2s.append(url)
+    print(len(h2s))
+
+    h3s=splitHTML(content,
+                  source.h3SectionDividerStart,
+                  source.h3SectionDividerEnd,
+                  source.h3DelStart,
+                  source.h3DelEnd)
+    '''
+
+    return h1s, h2s, h3s
+
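+#Sketch of what extractURLs hands back (values hypothetical): given a NewsSource
+#whose h1/h2 delimiter fields locate the <a href="..."> tags on the front page,
+#it returns three lists of absolute story urls, e.g.
+#    h1s=['http://www.example.com/top-story.html']
+#    h2s=['http://www.example.com/second-story.html', ...]
+#    h3s=[]  #h3 extraction still lives in the commented-out block above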
+
+def buildOutput(newsSourceArr):
+    #read in the template html file
+    f=open('html_template/template.html', 'r')
+    template=f.read()
+    f.close()
+
+    #set the random order for sources
+    h1RandomSources=random.sample(range(len(newsSourceArr)), 4)
+    h2RandomSources=random.sample(range(len(newsSourceArr)), 4)
+    '''
+    print(h3RandomSources)
+    h2RandomSources=random.sample(range(len(newsSourceArr)), 1)
+    print(h3RandomSources)
+    '''
+
+    #replace html template locations with data from newsSourceArr
+    for i in range(len(h1RandomSources)):
+        source=newsSourceArr[h1RandomSources[i]]
+        randomArticle=random.sample(range(len(source.h1Arr)), 1)[0]
+        article=source.h1Arr[randomArticle]
+        template=template.replace('xxURL1-'+str(i+1)+'xx', article.url)
+        template=template.replace('xxTitle1-'+str(i+1)+'xx', article.title)
+        template=template.replace('xxImg1-'+str(i+1)+'xx', article.img)
+
+    for i in range(len(h2RandomSources)):
+        source=newsSourceArr[h2RandomSources[i]]
+        randomArticle=random.sample(range(len(source.h2Arr)), 1)[0]
+        article=source.h2Arr[randomArticle]
+        template=template.replace('xxURL2-'+str(i+1)+'xx', article.url)
+        template=template.replace('xxTitle2-'+str(i+1)+'xx', article.title)
+        template=template.replace('xxImg2-'+str(i+1)+'xx', article.img)
+
+    #return updated text
+    return template
+
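+#The placeholders buildOutput fills in look like this (a hypothetical fragment of
+#html_template/template.html, not the real file):
+#    <a href="xxURL1-1xx"><img src="xxImg1-1xx"/><h2>xxTitle1-1xx</h2></a>
+#with the '1-1' part meaning tier 1, slot 1, through tier 2, slot 4.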
+def printOutputHTML(outputHTML, outFile):
+    timestamp=time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime())
+    outputHTML=outputHTML.replace('xxTimexx', timestamp)
+
+    f=open(outFile, 'w')
+    f.write(outputHTML)
+    f.close()
+
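+#Note: %-d, %-I and %P in the timestamp format are glibc extensions, so the
+#formatting above assumes a Linux host; the output looks roughly like
+#'Mon, Feb 6, 3:05pm EST' (example only).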
+def buildNewsSourceArr(sourceList):
+
+    #build the data structure
+    for i in range(len(sourceList)):
+        source=sourceList[i]
+        url=source.url
+
+        #download file
+        os.system('wget -q -O scratch/temp'+str(i)+'.html --no-check-certificate '+url)
+
+        #read file
+        f=open('scratch/temp'+str(i)+'.html', 'r', encoding="utf8")
+        content=f.read()
+        f.close()
+
+        #delete file MAYBE DON'T DO THIS? CAUSES OS ERRORS
+        #os.remove('scratch/temp'+str(i)+'.html')
+
+        #add stories etc to the NewsSource object
+        h1s, h2s, h3s=extractURLs(content, source)
+
+        #build the Article objects and add to newsSource's appropriate list
+        for url in h1s:
+            article=buildArticle(url, source.name)
+            source.addArticle(article, 1) #sourceList[i].h1Arr.append(article)
+        for url in h2s:
+            article=buildArticle(url, source.name)
+            sourceList[i].h2Arr.append(article)
+        for url in h3s:
+            article=buildArticle(url, source.name)
+            sourceList[i].h3Arr.append(article)
+
+    #return the original sourceList,
+    #since everything should have been modified in place
+    return sourceList
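+
+
+#A minimal end-to-end sketch of how these functions chain together (the NewsSource
+#construction and output path are hypothetical; see unbiasedObjects.py for the
+#real constructor):
+#    sources=buildNewsSourceArr([NewsSource(...)])   #download and parse each source
+#    html=buildOutput(sources)                       #fill in the template
+#    printOutputHTML(html, 'scratch/index.html')     #stamp the time and write it out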