Here is a Python 2.x script that logs in to Google from the command line and downloads what Google records in the Web History.
Google offers an RSS feed with the items recorded in an account's Google Web History. The script logs in to Google and downloads the feed. The feed is always 25 items long.
#!/usr/bin/python
# -*- coding: utf8 -*-
#
# downloadgooglehistory.py, v1.0, 2010-11-14
#
# Copyright (c) 2010 Adrian Cozma, http://pte.ro
#
# Released under GPL 2.0 strict
#
# Full license text at
# http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
################################################################################
################################################################################
##
## user editable
##
# User-Agent string; googleLogin() sends it as the 'User-agent' request header.
userAgent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
# NOTE(review): `download` is not read anywhere in the visible script --
# confirm whether it is still needed.
download = True
# Intended switch for writing the downloaded feeds to disk.
# NOTE(review): the name `saveToFile` is rebound by the `def saveToFile(...)`
# further down in this file, so the main section ends up passing that function
# object (always truthy) as `save`; setting this to False has no effect as
# the script is written.
saveToFile = True
# One (username, password) pair per Google account; the main section loops
# over these pairs.
logindata = (
('username[at]gmail[dot]com','password'),
)
# Reference point in time; the main section unpacks it into the
# year/month/day/hour/minute/second arguments of downloadGoogleHistory().
dateValue = ( 2010, 10, 1, 0, 0, 0 ) # year, month, day, hour, minute, second
##
## end of user editable
##
################################################################################
################################################################################
import cookielib
import datetime
import html5lib
import re
import sys
import time
import urllib
import urllib2
################################################################################
################################################################################
##
## functions
##
def saveToFile(filename, data):
    """Write ``data`` to ``filename``, replacing any existing content.

    Parameters:
        filename -- path of the file to create or overwrite (opened with
                    mode 'w').
        data     -- string to write.

    Raises:
        IOError if the file cannot be opened or written.
    """
    # The with-statement closes the file even when write() raises, which the
    # previous explicit open()/close() pair did not guarantee.
    with open(filename, 'w') as f:
        f.write(data)
################################################################################
## setting up the cookie system
# Cookie jar that collects the cookies of every request made through the
# opener installed below.
jar = cookielib.LWPCookieJar()
# The cookies are deliberately kept in memory only. The commented-out lines
# below sketch how the jar could be loaded from / saved to a file instead.
# NOTE(review): `subdirectorytosavefiles` is not defined in the visible script.
### COOKIEFILE = "./" + subdirectorytosavefiles + "/cookies.lwp"
### try:
### jar.load(COOKIEFILE)
### except cookielib.LoadError:
### jar.save(COOKIEFILE)
### jar.load(COOKIEFILE)
# Build an opener that handles cookies through `jar` and install it globally,
# so that the plain urllib2.urlopen() call in goTo() goes through it.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
urllib2.install_opener(opener)
def goTo(url, postDictionary=None, headers=None, info=None, verbose=True):
    """Open ``url`` with urllib2 and return the response handle.

    Parameters:
        url            -- address to request.
        postDictionary -- mapping that is url-encoded and sent as the request
                          body; ``None`` sends no body. Per the urllib2
                          documentation a request carrying data (even an
                          empty string, as produced by an empty dict) is
                          sent as POST.
        headers        -- mapping of extra request headers; ``None`` means
                          no extra headers.
        info           -- free-form label, only printed in verbose mode.
        verbose        -- when true, print the request details and, on
                          success, the response headers.

    Returns:
        The object returned by ``urllib2.urlopen``.

    On IOError (which covers urllib2's URLError/HTTPError) the error is
    reported on stdout and the script terminates through ``sys.exit(1)``.

    The print calls use the single-argument parenthesised form, which
    produces the same output under Python 2's print statement.
    """
    # urllib2.Request iterates over ``headers``; passing the ``None`` default
    # straight through would raise AttributeError, so normalise it first.
    if headers is None:
        headers = {}
    if postDictionary is not None:
        data = urllib.urlencode(postDictionary)
    else:
        data = None

    handle = None
    failure = False
    try:
        req = urllib2.Request(url, data=data, headers=headers)
        handle = urllib2.urlopen(req)
    except IOError as e:
        failure = True
        print('Failed to open "%s".' % url)
        if hasattr(e, 'code'):
            print('Failed with error code %s.' % e.code)
        if hasattr(e, 'reason'):
            print("Failed with 'reason' attribute: " + str(e.reason))

    if verbose:
        details = (
            ("[ info ]", info),
            ("[ URL ]", url),
            ("[ postData ]", postDictionary),
            ("[ headers ]", headers),
        )
        print("")
        for label, value in details:
            print("%s %s" % (label, value))
        print("")
        if not failure:
            print(handle.info())
            print("")

    if failure:
        # Non-zero status so that calling shells can detect the failure.
        sys.exit(1)
    return handle
################################################################################
## helper functions for parsing google login form
def __getTagValueForId(iterable, idValue):
    """Return the ``value`` attribute of the first token whose ``id`` is ``idValue``.

    Parameters:
        iterable -- iterable of token dictionaries. Only tokens whose
                    ``u'data'`` entry is a list of (attribute name,
                    attribute value) pairs are inspected; all other tokens
                    are skipped.
        idValue  -- id to look for (attribute names are compared
                    case-insensitively, the id value itself exactly).

    Returns:
        The value of the ``value`` attribute of the first matching token
        that has one, or ``None`` when there is no such token.
    """
    for token in iterable:
        if u'data' not in token:
            continue
        attributes = token[u'data']
        # 'data' can hold other types (e.g. text), only attribute lists matter
        if not isinstance(attributes, list):
            continue
        hasWantedId = any(
            name.lower() == u'id' and value == idValue
            for name, value in attributes
        )
        if not hasWantedId:
            continue
        # the element was found; scan its attributes again for the value
        for name, value in attributes:
            if name.lower() == u'value':
                return value
    return None
def __getTagValueForName(iterable, nameValue):
    """Return the ``value`` attribute of the first token whose ``name`` is ``nameValue``.

    Parameters:
        iterable  -- iterable of token dictionaries. Only tokens whose
                     ``u'data'`` entry is a list of (attribute name,
                     attribute value) pairs are inspected; all other tokens
                     are skipped.
        nameValue -- name to look for (attribute names are compared
                     case-insensitively, the name value itself exactly).

    Returns:
        The value of the ``value`` attribute of the first matching token
        that has one, or ``None`` when there is no such token.
    """
    for token in iterable:
        if u'data' not in token:
            continue
        attributes = token[u'data']
        # 'data' can hold other types (e.g. text), only attribute lists matter
        if not isinstance(attributes, list):
            continue
        hasWantedName = any(
            name.lower() == u'name' and value == nameValue
            for name, value in attributes
        )
        if not hasWantedName:
            continue
        # the element was found; scan its attributes again for the value
        for name, value in attributes:
            if name.lower() == u'value':
                return value
    return None
def __addPostDataIfIdOrNameExists(dictionary, stream, valueNameOrId):
    """Copy a form field's value from ``stream`` into ``dictionary``.

    The field is looked up by id first and by name second; the first lookup
    that yields a value stores it under the key ``valueNameOrId``. When
    neither lookup finds a value, ``dictionary`` is left untouched.
    """
    for lookup in (__getTagValueForId, __getTagValueForName):
        found = lookup(stream, valueNameOrId)
        if found is not None:
            dictionary[valueNameOrId] = found
            return
################################################################################
def googleGetHistory(direction, year, month, day, hour=0, minute=0, second=0, verbose=False):
    """Download the Web History RSS feed relative to a point in time.

    Parameters:
        direction -- 'min' or 'max'; used as the name of the query
                     parameter that carries the timestamp.
        year, month, day, hour, minute, second
                  -- the point in time; converted with time.mktime(), i.e.
                     interpreted as local time of the machine running the
                     script.
        verbose   -- passed on to goTo().

    Returns:
        A tuple ``(query, page)``: the query string that was requested and
        the downloaded feed, with a line break inserted between adjacent
        tags.

    An invalid ``direction`` is reported on stdout and terminates the script
    through ``sys.exit(1)``.
    """
    if direction not in ('min', 'max'):
        print("error: direction must be 'min' or 'max', got %r" % (direction,))
        sys.exit(1)

    datetimeValue = datetime.datetime(year, month, day, hour, minute, second)
    unixtime = time.mktime(datetimeValue.timetuple())

    # The timestamp is sent as whole seconds followed by six zeros.
    # NOTE(review): presumably the service expects microseconds -- confirm.
    query = "q="
    query += "&output=rss"
    query += "&" + direction + "=" + ('%d' % unixtime) + "000000"
    #query += "&items=100"
    url = 'http://www.google.com/history/lookup?' + query

    handler = goTo(url=url, postDictionary={}, headers={}, info='rss', verbose=verbose)
    page = handler.read()
    # page might be compressed onto one line, so we break it into lines
    page = page.replace("><", ">\n<")
    return (query, page)
################################################################################
def googleLogin(googleuser, googlepass, userAgent, verbose=True):
    """Log in to Google so that later requests carry the session cookies.

    Steps:
      1. fetch the login page and parse it with html5lib;
      2. copy the hidden form fields found on the page, add the
         credentials and post the form;
      3. if the answer contains a JavaScript ``location.replace(...)``
         redirect, follow it.

    Parameters:
        googleuser -- value posted as 'Email'.
        googlepass -- value posted as 'Passwd'.
        userAgent  -- value sent as the 'User-agent' request header.
        verbose    -- passed on to goTo().

    Returns nothing; the result of the login lives in the cookies collected
    by the globally installed urllib2 opener.
    """
    url = 'https://www.google.com/accounts/ServiceLoginAuth'
    txheaders = {'User-agent': userAgent}
    handle = goTo(url, None, txheaders, "going to goole.com", verbose=verbose)

    ### received page: parse it and walk the resulting DOM as a token stream
    page = handle.read()
    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
    tree = parser.parse(page)
    walker = html5lib.treewalkers.getTreeWalker("dom")
    stream = walker(tree)  # stream is an iterable

    ### constructing the request to login
    postDict = {}
    formFields = ('continue', 'service', 'nui', 'dsh', 'hl', 'timeStmp', 'secTok', 'GALX')
    for fieldName in formFields:
        __addPostDataIfIdOrNameExists(postDict, stream, fieldName)
    postDict['Email'] = googleuser
    postDict['Passwd'] = googlepass
    postDict['remember'] = '1'
    handle = goTo(url, postDict, txheaders, "logging in", verbose=verbose)

    ### now is there a redirect ?
    page = handle.read()
    # Collapse line breaks so the search below works on a single line. The
    # character class holds only CR and LF: a comma in it would also strip
    # commas from the page and could corrupt the redirect URL.
    page = re.sub(r"[\r\n]+", " ", page)
    # Non-greedy capture: on the collapsed page a greedy '.*' would run up to
    # the last quote+parenthesis of the whole document. The quote class holds
    # only the two quote characters (no '|').
    # NOTE(review): JavaScript escapes inside the captured URL (e.g. \x3d)
    # are not decoded here -- confirm whether the page uses them.
    srch = re.search(r"""location\.replace\(["'](.*?)["']\)""", page)
    if srch is not None:
        url = srch.group(1)
        # Follow the redirect with a plain GET, as a browser does for
        # location.replace(); this also avoids re-sending the credentials to
        # a URL taken from the page content.
        handle = goTo(url, None, txheaders, "redirecting", verbose=verbose)
        page = handle.read()
    # login to google should be complete
# login to google should be complete
################################################################################
################################################################################
##
## Main logic
##
## (check the parameters at the top of the file)
##
def downloadGoogleHistory(
    googleuser, googlepass, direction, year, month, day, hour, minute, second, save=True
):
    """Log in, download one history feed and optionally store it in a file.

    The login uses the module-level ``userAgent`` setting. When ``save`` is
    true the feed is written to the current directory under the name
    ``"<year>-<mm>-<dd> <hh>:<mm>:<ss> <direction> <googleuser>.xml"``.

    Returns:
        A tuple ``(filename, page)``; ``filename`` is ``None`` when nothing
        was saved.
    """
    googleLogin(googleuser=googleuser, googlepass=googlepass, userAgent=userAgent, verbose=False)
    query, page = googleGetHistory(direction, year, month, day, hour, minute, second)

    if not save:
        return None, page

    # let's save to file
    datePart = "-".join([str(year), str(month).zfill(2), str(day).zfill(2)])
    timePart = ":".join([str(part).zfill(2) for part in (hour, minute, second)])
    filename = datePart + " " + timePart + " " + direction + " " + googleuser + ".xml"
    saveToFile(filename, page)
    return filename, page
###
if __name__ == "__main__":
    # For every configured account download two feeds around dateValue:
    #   'min' -> downloads 25 items AFTER dateValue
    #   'max' -> downloads 25 items BEFORE dateValue
    for login in logindata:
        googleuser, googlepass = login[0], login[1]
        # The password is deliberately not echoed: terminal output can end up
        # in scrollback buffers or log files.
        print("Logging in with user: '%s'" % (googleuser,))
        print("(if an error follows check the values at the top of the file)")
        print("")
        for direction in ('min', 'max'):
            # NOTE(review): at this point `saveToFile` names the function
            # defined earlier in this file (the def rebinds the user-editable
            # flag of the same name), so `save` is always truthy here.
            filename, page = downloadGoogleHistory(
                googleuser, googlepass, direction, *dateValue, save=saveToFile
            )
            print("")
            print(page)
            print("")
            print("")
First, unpack the archive.
You must edit the top of the script (the area marked as 'user editable') and enter your Google username and password; otherwise the script will fail to log in. Note that no login data is saved to disk (e.g. the cookies are not saved) except for your edits inside the script file, so pay attention to where you keep the script (definitely not in shared directories). Also, don't send the script to others (e.g. by e-mail) with your login data in it. You have been warned!
On Unix-like systems (Linux, OS X), you can also mark the script as executable (a one-time action):