path: root/unbiasedFunctions.py
Diffstat (limited to 'unbiasedFunctions.py')
-rw-r--r--  unbiasedFunctions.py  248
1 file changed, 248 insertions, 0 deletions
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
new file mode 100644
index 0000000..84b4ec9
--- /dev/null
+++ b/unbiasedFunctions.py
@@ -0,0 +1,248 @@
+from unbiasedObjects import *
+import os
+import random
+import time
+
+#take in a url and source name, return an Article built from the page's
+#Open Graph (twitter card) tags
+def buildArticle(url, sourceName):
+
+    print(sourceName)
+    print(url)
+    print()
+    #download url
+    os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url)
+
+    #read the file in
+    f=open('scratch/temp_article.html', 'r', encoding="utf8")
+    content=f.read()
+    f.close()
+
+    #the attribute quote could be ' or ", so drop the opening quote with [1:],
+    #then trim any self-closing '/' and finally lop off the closing quote
+    img=content.split('og:image" content=')[1][1:].split('>')[0].strip()
+    if img[-1]=='/':
+        img=img[:-1].strip()
+    img=img[:-1]
+
+    title=content.split('og:title" content=')[1][1:].split('>')[0].strip()
+    if title[-1]=='/':
+        title=title[:-1].strip()
+    title=title[:-1]
+
+    a=Article(title, url, img, sourceName)
+    return a
+
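+#A minimal usage sketch of buildArticle (the URL and source name below are
+#hypothetical, just to show the call shape):
+#    a=buildArticle('https://www.example.com/some-story.html', 'Example News')
+#    print(a.title, a.img)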
+
+#do the hardcore HTML parsing: trim content to the section between the dividers,
+#then pull out every substring bracketed by the delStart chain and delEnd
+def splitHTML(content, sectionDividerStart, sectionDividerEnd, delStart, delEnd):
+    retArr=[]
+
+    if sectionDividerStart!=None:
+        content=content.split(sectionDividerStart)[1]
+    if sectionDividerEnd!=None:
+        content=content.split(sectionDividerEnd)[0]
+    if delStart!=[]:
+        while True:
+            #walk the start delimiters in order, narrowing x each time
+            x=content
+            for delim in delStart:
+                if delim in x:
+                    x=x.split(delim)[1]
+            x=x.split(delEnd)[0]
+            if x not in retArr:
+                retArr.append(x)
+            #advance past the first start delimiter and repeat
+            content=content.split(delStart[0], 1)
+            if len(content)==1:
+                break
+            else:
+                content=content[1]
+
+    return retArr
+
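+#Rough illustration of splitHTML on a made-up snippet (not any real source's markup):
+#    splitHTML('<ul id="top"><li><a href="/a1">x</a><li><a href="/a2">y</a></ul>',
+#              '<ul id="top">', '</ul>', ['href="'], '"')
+#    would return ['/a1', '/a2']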
+
+
+'''
+************************
+
+Need to fix this function to use splitHTML() and actually loop through
+all of the links instead of just using the first one.
+
+************************
+'''
+
+#take in the already-downloaded HTML of a source's main page (e.g. nytimes.com)
+#and return lists of the story urls for each headline tier
+def extractURLs(content, source):
+    h1s=[]
+    h2s=[]
+    h3s=[]
+
+    #h1: grab just the top headline (see the TODO above about looping)
+    h1=content
+    if source.h1SectionDividerStart!=None:
+        h1=h1.split(source.h1SectionDividerStart)[1]
+    if source.h1SectionDividerEnd!=None:
+        h1=h1.split(source.h1SectionDividerEnd)[0]
+    for delim in source.h1DelStart:
+        h1=h1.split(delim)[1]
+    h1=h1.split(source.h1DelEnd)[0]
+    if '.com' not in h1:
+        h1=source.url+h1
+    h1s.append(h1)
+
+    #h2: walk every secondary headline in the section
+    h2=content
+    if source.h2SectionDividerStart!=None:
+        h2=h2.split(source.h2SectionDividerStart, 1)[1]
+    if source.h2SectionDividerEnd!=None:
+        h2=h2.split(source.h2SectionDividerEnd, 1)[0]
+
+    while source.h2DelStart[0] in h2:
+        x=h2
+        for delim in source.h2DelStart:
+            x=x.split(delim)[1]
+            h2=h2.split(delim, 1)[1]
+        x=x.split(source.h2DelEnd)[0]
+        h2=h2.split(source.h2DelEnd, 1)[1]
+        if '.com' not in x:
+            x=source.url+x
+        h2s.append(x)
+
+
+    '''
+    h2=content.split(source.h2SectionDividerStart, 1)[1]
+    h2=h2.split(source.h2SectionDividerEnd, 1)[0]
+
+    if source.h2DelStart!=[]:
+        while True:
+            x=h2
+            for delim in source.h2DelStart:
+                if delim in h2:
+                    x=h2.split(delim)[1]
+            x=x.split(source.h2DelEnd)[0]
+            if '.com' not in x:
+                x=source.url+x
+            if x not in h2s:
+                h2s.append(x)
+                print(x)
+            h2=h2.split(source.h2DelStart[0], 1)
+            if(len(h2)==1):
+                break
+            else:
+                h2=h2[1]#:][0]
+
+
+
+    h2s=splitHTML(content,
+                  source.h2SectionDividerStart,
+                  source.h2SectionDividerEnd,
+                  source.h2DelStart,
+                  source.h2DelEnd)
+
+    if source.h2SectionDividerStart!=None:
+        h2=h2.split(source.h2SectionDividerStart)[1]
+    if source.h2SectionDividerEnd!=None:
+        h2=h2.split(source.h2SectionDividerEnd)[0]
+
+    delim0=source.h2DelStart[0]
+    while delim0 in h2:
+        for delim in source.h2DelStart:
+            url=h2.split(delim)[1]
+            h2=''.join(h2.split(delim)[1:])
+        url=h2.split(source.h2DelEnd)[0]
+        h2=h2.split(source.h2DelEnd)[1]
+        if '.com' not in url:
+            url=source.url+url
+        h2s.append(url)
+    print(len(h2s))
+
+    h3s=splitHTML(content,
+                  source.h3SectionDividerStart,
+                  source.h3SectionDividerEnd,
+                  source.h3DelStart,
+                  source.h3DelEnd)
+    '''
+
+    return h1s, h2s, h3s
+
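+#Sketch of what extractURLs hands back (values hypothetical): given a NewsSource
+#whose h1/h2 delimiter fields locate the <a href="..."> tags on the front page,
+#it returns three lists of absolute story urls, e.g.
+#    h1s=['http://www.example.com/top-story.html']
+#    h2s=['http://www.example.com/second-story.html', ...]
+#    h3s=[]  #h3 extraction still lives in the commented-out block above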
+
+def buildOutput(newsSourceArr):
+    #read in the template html file
+    f=open('html_template/template.html', 'r')
+    template=f.read()
+    f.close()
+
+    #set the random order for sources
+    h1RandomSources=random.sample(range(len(newsSourceArr)), 4)
+    h2RandomSources=random.sample(range(len(newsSourceArr)), 4)
+    '''
+    print(h3RandomSources)
+    h2RandomSources=random.sample(range(len(newsSourceArr)), 1)
+    print(h3RandomSources)
+    '''
+
+    #replace html template locations with data from newsSourceArr
+    for i in range(len(h1RandomSources)):
+        source=newsSourceArr[h1RandomSources[i]]
+        randomArticle=random.sample(range(len(source.h1Arr)), 1)[0]
+        article=source.h1Arr[randomArticle]
+        template=template.replace('xxURL1-'+str(i+1)+'xx', article.url)
+        template=template.replace('xxTitle1-'+str(i+1)+'xx', article.title)
+        template=template.replace('xxImg1-'+str(i+1)+'xx', article.img)
+
+    for i in range(len(h2RandomSources)):
+        source=newsSourceArr[h2RandomSources[i]]
+        randomArticle=random.sample(range(len(source.h2Arr)), 1)[0]
+        article=source.h2Arr[randomArticle]
+        template=template.replace('xxURL2-'+str(i+1)+'xx', article.url)
+        template=template.replace('xxTitle2-'+str(i+1)+'xx', article.title)
+        template=template.replace('xxImg2-'+str(i+1)+'xx', article.img)
+
+    #return updated text
+    return template
+
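+#The placeholders buildOutput fills in look like this (a hypothetical fragment of
+#html_template/template.html, not the real file):
+#    <a href="xxURL1-1xx"><img src="xxImg1-1xx"/><h2>xxTitle1-1xx</h2></a>
+#with the '1-1' part meaning tier 1, slot 1, through tier 2, slot 4.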
+def printOutputHTML(outputHTML, outFile):
+    timestamp=time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime())
+    outputHTML=outputHTML.replace('xxTimexx', timestamp)
+
+    f=open(outFile, 'w')
+    f.write(outputHTML)
+    f.close()
+
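+#Note: %-d, %-I and %P in the timestamp format are glibc extensions, so the
+#formatting above assumes a Linux host; the output looks roughly like
+#'Mon, Feb 6, 3:05pm EST' (example only).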
+def buildNewsSourceArr(sourceList):
+
+    #build the data structure
+    for i in range(len(sourceList)):
+        source=sourceList[i]
+        url=source.url
+
+        #download file
+        os.system('wget -q -O scratch/temp'+str(i)+'.html --no-check-certificate '+url)
+
+        #read file
+        f=open('scratch/temp'+str(i)+'.html', 'r', encoding="utf8")
+        content=f.read()
+        f.close()
+
+        #delete file MAYBE DON'T DO THIS? CAUSES OS ERRORS
+        #os.remove('scratch/temp'+str(i)+'.html')
+
+        #add stories etc to the NewsSource object
+        h1s, h2s, h3s=extractURLs(content, source)
+
+        #build the Article objects and add to newsSource's appropriate list
+        for url in h1s:
+            article=buildArticle(url, source.name)
+            source.addArticle(article, 1) #sourceList[i].h1Arr.append(article)
+        for url in h2s:
+            article=buildArticle(url, source.name)
+            sourceList[i].h2Arr.append(article)
+        for url in h3s:
+            article=buildArticle(url, source.name)
+            sourceList[i].h3Arr.append(article)
+
+    #return the original sourceList,
+    #since everything should have been modified in place
+    return sourceList
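+
+
+#A minimal end-to-end sketch of how these functions chain together (the NewsSource
+#construction and output path are hypothetical; see unbiasedObjects.py for the
+#real constructor):
+#    sources=buildNewsSourceArr([NewsSource(...)])   #download and parse each source
+#    html=buildOutput(sources)                       #fill in the template
+#    printOutputHTML(html, 'scratch/index.html')     #stamp the time and write it out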