Check out my latest game, Cupid Strikes Back !

Downloading Google Web History using Python

Here is a Python 2.x script that logs in to Google from the command line and downloads what Google records in your Web History.

What the script does

Google offers an RSS feed of the items recorded in your account's Google Web History. The script logs into Google and downloads the feed. The feed is always 25 items long.

There are two GET parameters: min and max. I think they are mutually exclusive. Their value is the date and time in Unix time format, with extra digits appended: either "000000" or some other value whose meaning I don't know.

min returns 25 entries starting from the specified date.

max returns 25 entries up to the specified date.

The script

#!/usr/bin/python
# -*- coding: utf8 -*-

#
# downloadgooglehistory.py, v1.0, 2010-11-14
#
# Copyright (c) 2010 Adrian Cozma, http://pte.ro
#
# Released under GPL 2.0 strict
#
# Full license text at
# http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#


################################################################################
################################################################################
##
## user editable
##

# User-Agent string; downloadGoogleHistory hands it to googleLogin, which sends
# it as the 'User-agent' header of the login requests.
userAgent  = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'

# NOTE(review): `download` is not read anywhere else in this script.
download   = True
# Passed as the `save` argument in the main section.
# NOTE(review): the later `def saveToFile(...)` rebinds this same name to a
# function, so by the time the main section runs this boolean has been replaced.
saveToFile = True

# One (username, password) pair per account; the main section loops over all
# of them.
logindata = (
  ('username[at]gmail[dot]com','password'),
)

# Reference moment for both the 'min' and the 'max' query; the six values are
# unpacked positionally into downloadGoogleHistory.
dateValue = ( 2010, 10, 1, 0, 0, 0 ) # year, month, day, hour, minute, second

##
## end of user editable
##
################################################################################
################################################################################


import cookielib
import datetime
import html5lib
import re
import sys
import time
import urllib
import urllib2


################################################################################
################################################################################
##
## functions
##


# The user-editable section binds the name `saveToFile` to a boolean and the
# `def` below rebinds that same name to a function.  Capture the flag first so
# the user's choice is not lost; default to True when no flag was defined.
_saveToFileEnabled = globals().get('saveToFile', True)


def saveToFile (filename,data):
  """Write `data` to the file `filename`, replacing any previous content.

  filename -- path of the file to create or overwrite
  data     -- string to write

  Does nothing when the user-editable `saveToFile` flag was set to a false
  value.  Returns None.
  """
  if not _saveToFileEnabled:
    return
  # `with` closes the file even when write() raises
  with open(filename,'w') as f:
    f.write(data)


################################################################################


## setting up the cookie system
# One cookie jar shared by every request made through urllib2, so cookies set
# by one response are sent along with the following requests.
jar = cookielib.LWPCookieJar()
# let's not save the cookies to the disk
# (disabled persistence code kept for reference; note that
#  `subdirectorytosavefiles` is not defined anywhere in this script)
### COOKIEFILE = "./" + subdirectorytosavefiles + "/cookies.lwp"
### try:
###   jar.load(COOKIEFILE)
### except cookielib.LoadError:
###   jar.save(COOKIEFILE)
###   jar.load(COOKIEFILE)
# Install the cookie-aware opener globally: goTo calls urllib2.urlopen, which
# then goes through this opener.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
urllib2.install_opener(opener)


def goTo( url, postDictionary=None, headers=None, info=None, verbose=True ):
  failure = False

  if None != postDictionary:
    data = urllib.urlencode(postDictionary)
  else:
    data = None

  try:
    req    = urllib2.Request ( url, data=data, headers=headers )
    handle = urllib2.urlopen (req)
  except IOError,e:
    report = True
    failure= True
    print 'Failed to open "%s".' % url
    if hasattr (e, 'code'):
      print 'Failed with error code %s.' % e.code
    if hasattr (e, 'reason'):
      print "Failed with 'reason' attribute: " + str(e.reason)

  if verbose:
    print
    print "[ info     ]",info
    print "[ URL      ]", url
    print "[ postData ]", postDictionary
    print "[ headers  ]", headers
    print
    if not failure:
      print handle.info()
      print

  if failure:
    sys.exit()

  return handle


################################################################################
## helper functions for parsing google login form


def __getTagValueForId (iterable,idValue):
  """Return the 'value' attribute of the first token whose 'id' is `idValue`.

  iterable -- sequence of token dictionaries; only tokens whose u'data' entry
              is a list of (attribute, value) pairs are inspected
  idValue  -- id to look for (attribute names are compared case-insensitively)

  Returns the attribute value, or None when no matching token carries a
  'value' attribute.
  """
  for tok in iterable:
    if u'data' not in tok:
      continue
    attrs = tok[u'data']
    # data can be of any type; anything that is not a list is skipped
    if type(attrs) is not list:
      continue
    for attrName,attrValue in attrs:
      if attrName.lower()!=u'id' or attrValue!=idValue:
        continue
      # this token has the wanted id: scan it again for its value
      for otherName,otherValue in attrs:
        if otherName.lower()==u'value':
          return otherValue
  return None


def __getTagValueForName (iterable,nameValue):
  """Return the 'value' attribute of the first token whose 'name' is `nameValue`.

  iterable  -- sequence of token dictionaries; only tokens whose u'data' entry
               is a list of (attribute, value) pairs are inspected
  nameValue -- name to look for (attribute names are compared
               case-insensitively)

  Returns the attribute value, or None when no matching token carries a
  'value' attribute.
  """
  for token in iterable:
    if u'data' not in token:
      continue
    attributes = token[u'data']
    # data can be of any type; only attribute lists are of interest
    if not isinstance(attributes, list):
      continue
    # but not any list, only those containing name==nameValue
    hasName = False
    for k,v in attributes:
      if k.lower()==u'name' and v==nameValue:
        hasName = True
        break
    if not hasName:
      continue
    # we have found the element, now look for its value
    for k,v in attributes:
      if k.lower()==u'value':
        return v
  #print "no element for name",nameValue
  return None


def __addPostDataIfIdOrNameExists ( dictionary, stream, valueNameOrId ):
  """Copy one form field from the token stream into `dictionary`.

  The field is looked up by id first and by name second; the first value
  found is stored under the key `valueNameOrId`.  `dictionary` is left
  untouched when neither lookup yields a value.
  """
  found = __getTagValueForId (stream,valueNameOrId)
  if found is None:
    # no element with that id: fall back to the name attribute
    found = __getTagValueForName (stream,valueNameOrId)

  if found is not None:
    dictionary[valueNameOrId] = found


################################################################################


def googleGetHistory ( direction, year, month, day, hour=0, minute=0, second=0, verbose=False ):
  """Download the Web History RSS feed around the given date and time.

  direction -- 'min' or 'max'; used as the name of the query parameter that
               carries the timestamp
  year, month, day, hour, minute, second -- the reference moment; it is
               converted with time.mktime, i.e. interpreted as local time
  verbose   -- passed through to goTo

  Returns the tuple (query, page): the query string that was requested and
  the response body with a newline inserted between adjacent tags.

  Exits the process with a non-zero status (SystemExit) when `direction` is
  neither 'min' nor 'max'.
  """
  if direction not in ('min','max'):
    # sys.exit with a string prints it to stderr and exits with status 1
    sys.exit("googleGetHistory: direction must be 'min' or 'max', got %r" % (direction,))

  moment   = datetime.datetime(year,month,day,hour,minute,second,tzinfo=None)
  unixtime = time.mktime(moment.timetuple())

  # the timestamp is the Unix time in seconds followed by six zeros
  query  = "q="
  query += "&output=rss"
  query += "&" + direction + "=" + ('%d' % unixtime) + "000000"
  #query += "&items=100"

  url = 'http://www.google.com/history/lookup?' + query

  handler = goTo( url=url, postDictionary={}, headers={}, info='rss', verbose=verbose )

  page = handler.read()
  page = page.replace("><",">\n<") # page might be compressed, so we break it into lines

  return (query,page)


################################################################################


def googleLogin ( googleuser, googlepass, userAgent, verbose=True ):
  """Log in to Google so that later requests carry the session cookies.

  googleuser -- account name, sent as the 'Email' form field
  googlepass -- account password, sent as the 'Passwd' form field
  userAgent  -- value of the 'User-agent' header sent with every request
  verbose    -- passed through to goTo

  Three steps: fetch the login page, post the login form (the hidden fields
  found on the page plus the credentials), and follow a JavaScript
  `location.replace(...)` redirect when the response contains one.
  Returns None; goTo exits the process if a request fails.
  """

  url       = 'https://www.google.com/accounts/ServiceLoginAuth'
  txheaders = {'User-agent' : userAgent}

  ############################################################################

  ### step 1: fetch the login page

  handle = goTo( url, None, txheaders, "going to google.com", verbose=verbose )

  page   = handle.read()
  parser = html5lib.HTMLParser( tree=html5lib.treebuilders.getTreeBuilder("dom") )
  tree   = parser.parse(page)
  walker = html5lib.treewalkers.getTreeWalker ("dom")
  # stream is iterated once or twice per field below
  # NOTE(review): this relies on the walker being re-iterable -- confirm
  stream = walker(tree)

  ############################################################################

  ### step 2: post the login form to the same address

  postDict = {}

  # hidden form fields, copied over only when present on the page
  for fieldName in ('continue','service','nui','dsh','hl','timeStmp','secTok','GALX'):
    __addPostDataIfIdOrNameExists ( postDict, stream, fieldName )

  postDict['Email']    = googleuser
  postDict['Passwd']   = googlepass
  postDict['remember'] = '1'

  handle = goTo ( url, postDict, txheaders, "logging in", verbose=verbose )

  ############################################################################

  ### step 3: now is there a redirect ?

  page = handle.read()
  # page may have newlines, so get rid of them to do the search
  # (only CR and LF are replaced; commas are left alone)
  page = re.sub(r"[\n\r]+"," ",page)
  # non-greedy group: stop at the first closing quote + parenthesis, not at
  # the last one on the (now single-line) page
  srch = re.search(r"""location\.replace\(["'](.*?)["']\)""",page)

  if srch is not None:
    # redirecting
    # NOTE(review): the login form data is posted again to the redirect target
    url    = srch.group(1)
    handle = goTo ( url, postDict, txheaders, "redirecting", verbose=verbose )
    handle.read()

  ############################################################################

  # login to google should be complete


################################################################################
################################################################################
##
## Main logic
##
## (check the parameters at the top of the file)
##


def downloadGoogleHistory (
    googleuser, googlepass, direction, year, month, day, hour, minute, second, save=True
    ):
  """Log in, download one history feed and optionally save it to a file.

  googleuser, googlepass -- credentials handed to googleLogin
  direction              -- 'min' or 'max', handed to googleGetHistory
  year ... second        -- reference date and time for the query
  save                   -- when true, the feed is written to
                            "<date> <time> <direction> <user>.xml" in the
                            current directory

  Returns the tuple (filename, page); filename is None when nothing was saved.
  """
  googleLogin ( googleuser=googleuser, googlepass=googlepass, userAgent=userAgent, verbose=False )

  unusedQuery,page = googleGetHistory (direction,year,month,day,hour,minute,second)

  if not save:
    return None,page

  # e.g. "2010-10-01 00:00:00 min user.xml"
  # NOTE(review): the time part contains ':' characters, which some file
  # systems may not accept in file names
  datePart = "-".join( [str(year), str(month).zfill(2), str(day).zfill(2)] )
  timePart = ":".join( [str(hour).zfill(2), str(minute).zfill(2), str(second).zfill(2)] )
  filename = datePart + " " + timePart + " " + direction + " " + googleuser + ".xml"

  saveToFile(filename,page)

  return filename,page


###


if __name__ == "__main__":

  for login in logindata:
    googleuser = login[0]
    googlepass = login[1]
    print "Logging in with user: '%s' and password: '%s'" % (googleuser,googlepass)
    print "(if an error follows check the values at the top of the file)"
    print

    ## min -> downloads 25 items AFTER dateValue
    filename,page = downloadGoogleHistory (googleuser,googlepass,'min',*dateValue,save=saveToFile)

    print
    print page
    print
    print

    ## max -> downloads 25 items BEFORE dateValue
    filename,page = downloadGoogleHistory (googleuser,googlepass,'max',*dateValue,save=saveToFile)

    print
    print page
    print
    print



Download

 
File | File size
downloadgooglehistory.py.tar.gz (download this file) | 3 Kb

Usage (for beginners)

Firstly unpack the archive.

You must edit the top of the script (the area marked as 'user editable') and enter your Google username and password, otherwise the script will fail to log in. BTW, no login data is saved to disk (e.g. the cookies are not saved) except for your edits inside the script file, so pay attention to where you keep the script (definitely not in shared directories). Also, don't send the script to others (e.g. by e-mail) with your login data in it. You have been warned!

On any platform, you can invoke the script with:

python downloadgooglehistory.py

In Unices (Linux, OSX), you can also mark the script as executable (a one time action):

chmod 755 ./downloadgooglehistory.py

in order to invoke it like this:

./downloadgooglehistory.py