-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathgoogleparse.py
More file actions
72 lines (61 loc) · 3.7 KB
/
googleparse.py
File metadata and controls
72 lines (61 loc) · 3.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#-------------------------------------------------------------------------------
# Created on 14.12.2011
# @author: Van Der Korn
# @file: googleparse.py
#-------------------------------------------------------------------------------
# -*- coding=utf-8 -*-
from core.searchmachine.googlemachine import *
from core.database.googlemysqlprovider import *
from core.logger.filelogger import *
from core.datetime.datehelper import *
# Standard-library modules. They are imported after the wildcard imports on
# purpose, so these names always refer to the stdlib modules even if one of the
# wildcard imports exports a name that collides with them.
import datetime
import random
import sys
import time
import traceback
# ---------------------------------------------------------------------------
# Script setup: throttling constants, logger, database connection and the
# opening rows of the HTML log table.
# ---------------------------------------------------------------------------

# Index of the keyword currently being processed.
i = 0

# Delay bounds, all expressed in milliseconds.
timeout_down, timeout_up = 8000, 12000                    # after every request
timeout_down_interval, timeout_up_interval = 30000, 60000  # periodic longer pause
timeout_down_error, timeout_up_error = 60000, 90000        # after a failed request

# The periodic longer pause is taken once per `sleep_i` loop iterations.
sleep_i = 100

# All progress output goes to an HTML file.
log = FileLogger(filename="log.html")

# Database access: the keyword list is read from here and results are stored
# through the same connection.
# NOTE(review): credentials are hard-coded (root user, empty password) --
# consider moving them to configuration.
connection = MysqlProvider(host='localhost', dbname='ert', user='root', password='')
connection.set_charset(charset='utf8')

keywords = connection.getkeywords()
# `rowcount` is read right after the query, so it is taken to be the number of
# keywords returned -- presumably equal to len(keywords); TODO confirm.
count_keyword = connection.rowcount

# Page skeleton and opening of the log table.
log.writelog("<html><head><title>Log parser</title><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head><body><table width=100% border=1><tr><td colspan='5' align=center><b>")

# Banner row: start time and number of keywords to process.
log.writelog("".join([
    "Начало работы скрипта в ",
    DateHelper.get_time(),
    ".Количество обрабатоваемых слов ",
    str(count_keyword),
    "</b></td></tr>",
]))

# Column captions: number, time, domain, keyword, position.
log.writelog("".join([
    "<tr>",
    "<td align=center>№</td>",
    "<td>Время</td>",
    "<td align=center>Домен</td>",
    "<td align=center>Ключевик</td>",
    "<td align=center>Позиция</td>",
    "</tr>",
]))

# Date under which every position found during this run is stored.
current_year, current_month, current_day = DateHelper.get_date()
def _format_error_html(exc_type, exc_value, exc_trace, limit=2):
    """Return the formatted traceback of an exception as HTML-safe text.

    ``traceback.format_exception`` is used because it returns the text;
    ``traceback.print_exception`` only prints it and returns ``None``.
    The characters ``&``, ``<`` and ``>`` are escaped so that frames such as
    ``<module>`` stay visible inside the HTML log.
    """
    text = "".join(
        traceback.format_exception(exc_type, exc_value, exc_trace, limit=limit)
    )
    return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")


def _pause(lower_ms, upper_ms):
    """Sleep for a random time between *lower_ms* and *upper_ms* milliseconds."""
    time.sleep(random.uniform(lower_ms, upper_ms) / 1000)


# Process the keywords one by one.  A failure on one keyword is logged and the
# loop moves on to the next one.
while i < count_keyword:
    try:
        dict_keyword = keywords[i]
        keyword_id = dict_keyword["id"]
        keyword = dict_keyword["key_word"]
        domain = dict_keyword["domain"]
        city_id = dict_keyword["city_id"]

        useragent = connection.getuseragent()     # get random user agent
        ip = connection.geiipfromcity(city_id)    # get random IP from city

        # NOTE(review): the meaning of the positional arguments (50, the key
        # string, 1000) is defined by GoogleMachine -- confirm against it.
        gMachine = GoogleMachine(50, 'dsadsadas-ddsadsadas-G53m6qeS7EnJCILGTIkSVLA', 1000, domain)
        result = gMachine.parse(keyword, useragent, ip)
        print(result)

        log.writelog("<tr><td align=center>" + str(i+1) + "</td><td>"+DateHelper.get_time()+"</td><td align=center>" + str(domain) + "</td><td align=center>" + str(keyword) + "</td><td align=center>" + str(result) + "</td></tr>")
        connection.insertposition(key_word_id=keyword_id, position=result, day=current_day, month=current_month, year=current_year)

        # After each request it is required to wait.
        _pause(timeout_down, timeout_up)
    except Exception:
        # Only ordinary errors are handled here; KeyboardInterrupt and
        # SystemExit are allowed to stop the script.
        exc_type, exc_value, exc_trace = sys.exc_info()
        log.writelog("<tr><td colspan='5' align=center><b>")
        log.writelog(_format_error_html(exc_type, exc_value, exc_trace, limit=2))
        log.writelog("</b></td></tr>")
        log.writelog("<tr><td colspan='5' align=center><b>error: index i="+str(i)+" </b></td></tr>")
        # Back off for longer after an error.
        _pause(timeout_down_error, timeout_up_error)
    finally:
        # Take a longer pause after every `sleep_i`-th request.  `i + 1` is
        # the number of requests made so far, so the pause no longer fires
        # right after the very first request.
        if (i + 1) % sleep_i == 0:
            _pause(timeout_down_interval, timeout_up_interval)
        i += 1

# Closing row: finish time and number of keywords iterated over (failed ones
# included), followed by the closing tags of the log page.
log.writelog("<tr><td colspan='5' align=center><b>Окончание работы скрипта в " + DateHelper.get_time() + ".Количество обработаных слов " +str(i)+ "</b></td></tr></table></body></html>")