from unbiasedObjects import *
import os
import random
import time

#take in an article url and its source name, return an Article built from the page's og:title/og:image tags
def buildArticle(url, sourceName):
    """Download an article page and build an Article from its Open Graph tags.

    The page is fetched with wget into scratch/temp_article.html, then the
    og:image and og:title ``content`` attributes are pulled out by plain
    string splitting.

    Args:
        url: address of the article page to download.
        sourceName: name of the news source; echoed to stdout and passed
            through to the Article unchanged.

    Returns:
        Article(title, url, img, sourceName).

    Raises:
        IndexError: if the page has no og:image or og:title tag, or the
            tag's content is empty.
        OSError: if wget cannot be started or the downloaded file cannot
            be opened. A failing wget exit status is not checked.
    """
    # Imported locally so this function does not rely on the file's import block.
    import subprocess

    print(sourceName)
    print(url)
    print()

    # Pass the URL as its own argv entry rather than splicing it into a shell
    # command: scraped URLs often contain '&', ';' or quotes, which a shell
    # would interpret (cutting the URL short or running extra commands).
    subprocess.run(['wget', '-q', '-O', 'scratch/temp_article.html',
                    '--no-check-certificate', url])

    with open('scratch/temp_article.html', 'r', encoding="utf8") as f:
        content = f.read()

    img = _ogContent(content, 'og:image" content=')
    title = _ogContent(content, 'og:title" content=')

    return Article(title, url, img, sourceName)


def _ogContent(content, marker):
    """Return the quoted attribute value that follows marker in content."""
    # The quote character may be ' or ", so skip one character after the
    # marker instead of matching a specific quote, and read up to the tag end.
    value = content.split(marker)[1][1:].split('>')[0]
    # Self-closing tags end in '/>' (optionally preceded by spaces).
    if value[-1] == '/':
        value = value[:-1].strip()
    # Drop the closing quote.
    return value[:-1]


#do the hardcore HTML parsing
def splitHTML(content, sectionDividerStart, sectionDividerEnd, delStart, delEnd):
    """Collect the distinct snippets found between start and end delimiters.

    content is first narrowed to the text after sectionDividerStart and
    before sectionDividerEnd (either may be None to skip that step). The
    remaining text is then scanned repeatedly: each pass takes the text that
    follows a start delimiter, cuts it at delEnd, and records it if it has
    not been seen yet, then moves past the next occurrence of delStart[0].

    Quirks of the scan, kept as-is:
      * When several start delimiters are given, each one is applied to the
        full remaining text, so only the last one that is present decides
        the snippet.
      * When no start delimiter is present, the remaining text itself (up to
        delEnd) is recorded.

    Returns a list of snippets in first-seen order; empty when delStart is
    an empty list. Raises IndexError if sectionDividerStart is given but
    absent from content.
    """
    found = []

    # Restrict the search to the requested section of the page.
    if sectionDividerStart is not None:
        content = content.split(sectionDividerStart)[1]
    if sectionDividerEnd is not None:
        content = content.split(sectionDividerEnd)[0]

    if delStart == []:
        return found

    remaining = content
    while True:
        candidate = remaining
        for marker in delStart:
            if marker in remaining:
                candidate = remaining.split(marker)[1]
        candidate = candidate.split(delEnd)[0]

        if candidate not in found:
            found.append(candidate)

        # Step past the next primary delimiter; stop once none is left.
        pieces = remaining.split(delStart[0], 1)
        if len(pieces) == 1:
            break
        remaining = pieces[1]

    return found
    


'''
************************

Need to fix this function to use splitHTML() and actually loop through
all of the links instead of just using the first one.

************************
'''

#take in a read main source file (e.g. from nytimes.com) and return lists of the urls for stories
def extractURLs(content, source):
    """Extract story URLs from a news source's front page.

    Args:
        content: the front page's HTML as a string.
        source: object providing ``url`` plus, for both h1 and h2, the
            attributes ``hNSectionDividerStart`` / ``hNSectionDividerEnd``
            (string or None), ``hNDelStart`` (list of strings applied in
            order) and ``hNDelEnd`` (string).

    Returns:
        A tuple (h1s, h2s, h3s) of URL lists. h1s always holds exactly one
        URL. h2s holds one URL per h2 entry found; entries whose delimiters
        are incomplete are skipped. h3s is always empty.

    Raises:
        IndexError: if the h1 delimiters or an h1/h2 section start divider
            are not present in content.
    """
    h1s = []
    h2s = []
    # TODO: h3 extraction is not implemented yet; this list is returned empty.
    h3s = []

    # --- h1: a single link, found by applying each start delimiter in turn.
    h1 = content
    if source.h1SectionDividerStart is not None:
        h1 = h1.split(source.h1SectionDividerStart)[1]
    if source.h1SectionDividerEnd is not None:
        h1 = h1.split(source.h1SectionDividerEnd)[0]
    for delim in source.h1DelStart:
        h1 = h1.split(delim)[1]
    h1 = h1.split(source.h1DelEnd)[0]
    # Links without '.com' are treated as site-relative paths.
    if '.com' not in h1:
        h1 = source.url + h1
    h1s.append(h1)

    # --- h2: every link inside the h2 section.
    h2 = content
    if source.h2SectionDividerStart is not None:
        h2 = h2.split(source.h2SectionDividerStart, 1)[1]
    if source.h2SectionDividerEnd is not None:
        h2 = h2.split(source.h2SectionDividerEnd, 1)[0]

    h2Starts = source.h2DelStart
    # An empty delimiter list means there is nothing to look for.
    while h2Starts and h2Starts[0] in h2:
        link = h2
        try:
            for delim in h2Starts:
                link = link.split(delim)[1]
                h2 = h2.split(delim, 1)[1]
            h2 = h2.split(source.h2DelEnd, 1)[1]
        except IndexError:
            # This entry lacks one of its delimiters (e.g. truncated markup).
            # h2 has already moved past the first start delimiter, so the
            # loop still makes progress; skip the entry and keep the URLs
            # gathered so far.
            continue
        link = link.split(source.h2DelEnd)[0]
        if '.com' not in link:
            link = source.url + link
        h2s.append(link)

    return h1s, h2s, h3s


def buildOutput(newsSourceArr):
    """Fill the HTML template with randomly chosen h1 and h2 articles.

    Reads html_template/template.html, draws four distinct sources for the
    h1 slots and (independently) four for the h2 slots, picks one random
    article from each drawn source, and substitutes the placeholders
    'xxURL<level>-<slot>xx', 'xxTitle<level>-<slot>xx' and
    'xxImg<level>-<slot>xx' for slots 1 to 4.

    Args:
        newsSourceArr: sequence of sources, each with ``h1Arr`` and
            ``h2Arr`` lists of articles exposing ``url``, ``title``, ``img``.

    Returns:
        The template text with the placeholders above replaced. Any other
        placeholder is left untouched.

    Raises:
        ValueError: if there are fewer than four sources, or a drawn source
            has no article at the required level.
    """
    with open('html_template/template.html', 'r') as f:
        template = f.read()

    # Each level gets its own random draw of four distinct sources.
    h1Sources = random.sample(newsSourceArr, 4)
    h2Sources = random.sample(newsSourceArr, 4)

    template = _fillSlots(template, 1, [src.h1Arr for src in h1Sources])
    template = _fillSlots(template, 2, [src.h2Arr for src in h2Sources])

    return template


def _fillSlots(template, level, articleLists):
    """Replace one level's placeholders, one random article per list."""
    for slot, articles in enumerate(articleLists, start=1):
        if len(articles) == 0:
            raise ValueError('source has no h' + str(level)
                             + ' articles to choose from')
        article = random.choice(articles)
        suffix = str(level) + '-' + str(slot) + 'xx'
        template = template.replace('xxURL' + suffix, article.url)
        template = template.replace('xxTitle' + suffix, article.title)
        template = template.replace('xxImg' + suffix, article.img)
    return template

def printOutputHTML(outputHTML, outFile):
    """Stamp the page with the current local time and write it to outFile.

    Args:
        outputHTML: page text; every 'xxTimexx' placeholder is replaced with
            the formatted local time.
        outFile: path of the file to create or overwrite.
    """
    # NOTE: '%-d', '%-I' and '%P' are platform-specific strftime extensions
    # (no zero padding, lowercase am/pm) and are not available everywhere.
    timestamp = time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime())
    outputHTML = outputHTML.replace('xxTimexx', timestamp)

    # Article pages are decoded as UTF-8 elsewhere in this file, so titles may
    # contain characters the locale's default encoding cannot represent.
    # Writing UTF-8 explicitly avoids a UnicodeEncodeError in that case.
    # NOTE(review): confirm the template declares a UTF-8 charset.
    with open(outFile, 'w', encoding="utf8") as f:
        f.write(outputHTML)

def buildNewsSourceArr(sourceList):
    """Download each source's front page and attach its articles.

    For every source, the page at ``source.url`` is fetched with wget into
    scratch/temp<i>.html, story URLs are pulled out with extractURLs(), and
    an Article is built for each URL with buildArticle(). h1 articles are
    added through ``source.addArticle(article, 1)``; h2 and h3 articles are
    appended to ``source.h2Arr`` / ``source.h3Arr``.

    Args:
        sourceList: list of source objects; modified in place.

    Returns:
        The same sourceList object that was passed in.
    """
    # Imported locally so this function does not rely on the file's import block.
    import subprocess

    for i, source in enumerate(sourceList):
        pagePath = 'scratch/temp' + str(i) + '.html'

        # Pass the URL as its own argv entry so characters such as '&' or ';'
        # are not interpreted by a shell. The exit status is not checked.
        subprocess.run(['wget', '-q', '-O', pagePath,
                        '--no-check-certificate', source.url])

        with open(pagePath, 'r', encoding="utf8") as f:
            content = f.read()

        # The downloaded page is left on disk on purpose: deleting it here
        # was reported to cause OS errors.

        h1s, h2s, h3s = extractURLs(content, source)

        # Build the Article objects and file them under the matching level.
        for articleURL in h1s:
            source.addArticle(buildArticle(articleURL, source.name), 1)
        for articleURL in h2s:
            source.h2Arr.append(buildArticle(articleURL, source.name))
        for articleURL in h3s:
            source.h3Arr.append(buildArticle(articleURL, source.name))

    # Everything was modified in place, so hand back the original list.
    return sourceList