1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
|
#!/usr/bin/env python3
from unbiasedObjects import *
from unbiasedFunctions import *
from parser import *
import time
def main():
    """Regenerate the page forever, pausing between passes.

    Each pass prints a dashed separator line, calls run(), prints the
    separator again, and then sleeps for 600 seconds. The loop has no exit
    condition, so this function never returns normally.
    """
    separator = '-----------------------'
    pause_seconds = 600
    while True:
        print(separator)
        run()
        print(separator)
        time.sleep(pause_seconds)
def run():
    """Perform one scrape-and-publish pass.

    Assembles the list of news sources (five NewsSource objects configured
    inline, plus the objects returned by buildNYT() and buildFoxNews()),
    hands the list to buildNewsSourceArr(), renders the result with
    buildOutput(), and passes the rendered HTML to printOutputHTML()
    together with the path '/var/www/html/index.html'.

    The first two NewsSource arguments are the source's display name and
    its front-page URL. The remaining positional arguments are marker-string
    lists and delimiter strings whose exact meaning is defined by NewsSource
    (not visible in this file) -- NOTE(review): confirm against its
    constructor before editing any of them.
    """
    # Sources are evaluated in the same order they were originally appended:
    # the five inline definitions first, then NYT, then Fox News.
    feeds = [
        NewsSource(
            'BBC US',
            'http://www.bbc.com/news/world/us_and_canada',
            ['buzzard-item', '<a href="'],
            ['top_stories#', '<a href="'],
            [],
            None, None,
            '<div class="pigeon">', '<div id=',
            None, None,
            'http://www.bbc.com',
        ),
        NewsSource(
            'NBC News',
            'http://nbcnews.com',
            ['top-stories-section', 'panel_hero', '<a href="'],
            ['<div class="story-link', '<a href="'],
            [],
            None, None,
            'ad-content ad-xs mobilebox1',
            'taboola-native-top-stories-thumbnail',
            None, None,
        ),
        NewsSource(
            'CBS News',
            'http://cbsnews.com',
            ['<h1 class="title">', '<a href="'],
            ['<li data-tb-region-item>', '<a href="'],
            [],
            None, None,
            # Alternative delimiter pair left disabled in the original:
            # 'Big News Area Side Assets', '</a>'
            'Big News Area Side Assets', '</ul></div>',
            None, None,
        ),
        NewsSource(
            'The Blaze',
            'http://theblaze.com',
            ['<a class="gallery-link" href="'],
            ['</figure>\n\n<figure class="gallery-item">', 'href="'],
            [],
            '<!-- home -->', '<!-- loop-home -->',
            '<!-- home -->', '<!-- loop-home -->',
            None, None,
        ),
        NewsSource(
            'Weekly Standard',
            'http://www.weeklystandard.com/',
            ['<div class="lead-photo">', 'href="'],
            ['<div class="lead-photo">', 'href="'],
            [],
            '<div id="region_1"', '<div id="region_2"',
            '<div class="widget lead-story layout-3col-feature" data-count="2">',
            '<div id="region_2"',
            None, None,
        ),
        buildNYT(),
        buildFoxNews(),
    ]

    # Scrape all URLs and build the data structure.
    scraped = buildNewsSourceArr(feeds)

    # Build the output page HTML from the scraped data.
    page = buildOutput(scraped)

    # Hand the HTML and the destination path to the output helper.
    printOutputHTML(page, '/var/www/html/index.html')
# Start the refresh loop only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|