From 7ceea6a5a495302ffdec9921ea9f841a2b6df8c2 Mon Sep 17 00:00:00 2001
From: sstvinc2 <sstvinc2@gmail.com>
Date: Tue, 14 Feb 2017 21:02:29 -0600
Subject: New parsing method started

Got NYT up and running with a new object type and a custom parser.
---
 unbiasedFunctions.py | 41 ++++++-----------------------------------
 1 file changed, 6 insertions(+), 35 deletions(-)

(limited to 'unbiasedFunctions.py')

diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index c2f62c0..ef6ae7c 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -3,6 +3,7 @@ import os
 import random
 import time
 
+
 #take in a url and delimiters, return twitter card
 def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
 
@@ -44,41 +45,6 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im
         print("Article parsing error in buildArticle() for URL: "+url)
         return None
 
-#do the hardcore HTML parsing
-def splitHTML(content, sectionDividerStart, sectionDividerEnd, delStart, delEnd):
-    retArr=[]
-    
-    if sectionDividerStart!=None:
-        content=content.split(sectionDividerStart)[1]
-    if sectionDividerEnd!=None:
-        content=content.split(sectionDividerEnd)[0]
-    if delStart!=[]:
-        while True:
-            x=content
-            for delim in delStart:
-                if delim in content:
-                    x=content.split(delim)[1]
-            x=x.split(delEnd)[0]
-            if x not in retArr:
-                retArr.append(x)   
-            content=content.split(delStart[0], 1)
-            if(len(content)==1):
-                break
-            else:
-                content=content[1:][0]
-
-    return retArr
-    
-
-
-'''
-**********************8
-
-Need to fix this function to use splitHTML() and actually loop through
-all of the links instead of just using the first one.
-
-************************
-'''
 
 #take in a read main source file (e.g. from nytimes.com) and return lists of the urls for stories
 def extractURLs(content, source):
@@ -202,6 +168,11 @@ def buildNewsSourceArr(sourceList):
     listLen=len(sourceList)
     while i < listLen:
         source=sourceList[i]
+
+        if type(source) is NewsSource2:
+            i+=1
+            continue
+
         url=source.url
 
         #download file
-- 
cgit v1.2.3