From 7ceea6a5a495302ffdec9921ea9f841a2b6df8c2 Mon Sep 17 00:00:00 2001 From: sstvinc2 Date: Tue, 14 Feb 2017 21:02:29 -0600 Subject: New parsing method started Got NYT up and running with new object type and custom parser --- parser.py | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 parser.py (limited to 'parser.py') diff --git a/parser.py b/parser.py new file mode 100644 index 0000000..2020f55 --- /dev/null +++ b/parser.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 + +from unbiasedObjects import * +from unbiasedFunctions import buildArticle +import os + +def buildNYT(): + url='http://www.nytimes.com' + + #download file + os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url) + + #read file + f=open('scratch/temp1.html', 'r')#, encoding="utf8") + content=f.read() + f.close() + + #get main headline + #this will likely need if/else logic + h1=content + + #This is with a large headline over a and b columns + h1=h1.split('story theme-summary banner', 1)[1] + h1=h1.split('', 1)[1] + h2=h2.split('', 1)[0] + #remove "collection" sets + while '
' in h2: + arr=h2.split('