My Google location history data was exported from Google Timeline.
My goal is to build a timeline of the places I have been so that I can join this data with other personal data sets, which will allow me to build a personal data story.
For now, the data I will work with is from 2020 only.
Directory Structure of the Google Data:
Takeout/Location History/Semantic Location History/2020
There are 12 separate JSON files, 1 for each month of 2020. "2020_JANUARY" , "2020_FEBRUARY", etc.
import os
import pandas as pd
import numpy as np
import datetime
import json
import csv
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Make sure current directory is where your 2020 json files are
# The relative paths used later ("2020", 'FULL_places.csv',
# 'FULL_activity_points.csv') are resolved against this working directory.
# NOTE(review): the bare string literals below appear to be the values the
# notebook echoed for the os.getcwd() calls; they have no effect as code.
os.getcwd()
'c:\\Users\\Brandi\\Documents\\Data Projects\\my-data-story'
# Machine-specific path: adjust to wherever the Takeout export was unpacked.
os.chdir('c:\\Users\\Brandi\\Documents\\Google Data Export\\Takeout\\Location History\\Semantic Location History')
os.getcwd()
'c:\\Users\\Brandi\\Documents\\Google Data Export\\Takeout\\Location History\\Semantic Location History'
#Flattens one placeVisit record into a row (list) for the places CSV.
def placeVisit(placeVisit_dict):
    """Turn a ``placeVisit`` record into a flat list.

    The returned list is
    ``[place_id, lat, lon, address, start_time, end_time, confidence,
    place_name]``. The E7 coordinates are converted with ``int(...) / 1e7``,
    newlines in the address become ``", "``, and both duration timestamps
    are formatted by ``timeStampToDate``.

    Raises ``KeyError`` if any of the expected keys is absent.
    """
    where = placeVisit_dict["location"]
    place_id = where["placeId"]
    raw_lat = where["latitudeE7"]
    raw_lon = where["longitudeE7"]
    place_name = where["name"]
    # Addresses come as multi-line text; keep them on a single CSV-friendly line.
    address = where["address"].replace("\n", ", ")

    when = placeVisit_dict["duration"]
    raw_start = when["startTimestampMs"]
    raw_end = when["endTimestampMs"]
    confidence = placeVisit_dict["visitConfidence"]

    return [
        place_id,
        int(raw_lat) / 1e7,
        int(raw_lon) / 1e7,
        address,
        timeStampToDate(int(raw_start)),
        timeStampToDate(int(raw_end)),
        confidence,
        place_name,
    ]
#Returns every point of an activity (start, waypoints, end) as a list of rows.
def activitySegment(activitySegment_dict):
    """Assemble all rows describing one activity segment.

    The result is a list of lists: the start row, then one row per
    intermediate point from ``activityRawPoints``, then the end row.
    The end row produced by ``activityEndPoint`` has no order value, so
    the next order number is placed at index 1 to give it the same
    layout as the other rows.
    """
    first_row = activityStartPoint(activitySegment_dict)
    last_row = activityEndPoint(activitySegment_dict)
    middle_rows = activityRawPoints(activitySegment_dict, first_row)

    # The start row is order 1 and the intermediate rows are 2..n+1, so the end is n+2.
    closing_order = len(middle_rows) + 2
    closing_row = last_row[:1] + [closing_order] + last_row[1:]

    return [first_row] + middle_rows + [closing_row]
#Builds the first row (order 1) of an activity.
def activityStartPoint(activitySegment_dict):
    """Return the start row of an activity segment.

    Layout: ``[trip_id, order, lat, lon, time_stamp, distance, ac_type,
    confidence, time_convention]``. ``trip_id`` is the raw
    ``startTimestampMs`` value, ``order`` is always 1, and ``distance``
    falls back to 0 when the segment has no ``"distance"`` key.
    """
    segment = activitySegment_dict
    started_ms = segment["duration"]["startTimestampMs"]
    origin = segment["startLocation"]
    raw_lat = origin["latitudeE7"]
    raw_lon = origin["longitudeE7"]
    started_at = timeStampToDate(int(started_ms))
    travelled = segment.get("distance", 0)
    kind = segment["activityType"]
    certainty = segment["confidence"]
    am_or_pm = timeStampToAMPM(int(started_ms))

    return [
        started_ms,
        1,
        int(raw_lat) / 1e7,
        int(raw_lon) / 1e7,
        started_at,
        travelled,
        kind,
        certainty,
        am_or_pm,
    ]
#Creates a list of lists, one row per intermediate point of the activity.
def activityRawPoints(activitySegment_dict, start_point):
    """Return the rows for the points between an activity's start and end.

    Points are taken from ``waypointPath.waypoints`` when that key exists,
    otherwise from ``simplifiedRawPath.points``; with neither key (or an
    empty point list) the result is ``[]``.

    Args:
        activitySegment_dict: the ``activitySegment`` record.
        start_point: the row built by ``activityStartPoint``; trip id,
            distance, activity type and confidence are copied from it.

    Returns:
        A list of ``[trip_id, order, lat, lon, time_stamp, distance,
        ac_type, confidence, time_convention]`` rows, with ``order``
        starting at 2 (the start row is order 1).
    """
    if "waypointPath" in activitySegment_dict:
        raw_points = activitySegment_dict["waypointPath"]["waypoints"]
        # Waypoints carry no timestamp of their own; they reuse the start time.
        has_own_timestamp = False
    elif "simplifiedRawPath" in activitySegment_dict:
        raw_points = activitySegment_dict["simplifiedRawPath"]["points"]
        has_own_timestamp = True
    else:
        return []

    if not raw_points:
        return []

    # Everything below is identical for every point, so compute it once.
    trip_id = start_point[0]
    start_time_stamp = start_point[4]
    distance = start_point[5]
    ac_type = start_point[6]
    confidence = start_point[7]
    # AM/PM is derived from the trip's start timestamp, not from each point.
    time_convention = timeStampToAMPM(int(trip_id))

    points = []
    for order, point in enumerate(raw_points, start=2):
        lat = int(point["latE7"]) / 1e7
        lon = int(point["lngE7"]) / 1e7
        if has_own_timestamp:
            time_stamp = timeStampToDate(int(point["timestampMs"]))
        else:
            time_stamp = start_time_stamp
        points.append([trip_id, order, lat, lon, time_stamp, distance,
                       ac_type, confidence, time_convention])
    return points
#Builds the last row of an activity (without its order value).
def activityEndPoint(activitySegment_dict):
    """Return the end row of an activity segment.

    Layout: ``[trip_id, lat, lon, time_stamp, distance, ac_type,
    confidence, time_convention]``. There is no order element here;
    ``activitySegment`` inserts it at index 1. ``trip_id`` is the raw
    ``startTimestampMs`` value, ``time_stamp`` is the formatted
    ``endTimestampMs``, and ``time_convention`` is computed from the
    start timestamp.
    """
    segment = activitySegment_dict
    started_ms = segment["duration"]["startTimestampMs"]
    destination = segment["endLocation"]
    raw_lat = destination["latitudeE7"]
    raw_lon = destination["longitudeE7"]
    ended_ms = segment["duration"]["endTimestampMs"]
    travelled = segment.get("distance", 0)
    kind = segment["activityType"]
    certainty = segment["confidence"]
    am_or_pm = timeStampToAMPM(int(started_ms))

    return [
        started_ms,
        int(raw_lat) / 1e7,
        int(raw_lon) / 1e7,
        timeStampToDate(int(ended_ms)),
        travelled,
        kind,
        certainty,
        am_or_pm,
    ]
#Formats a millisecond epoch timestamp as a readable date string.
def timeStampToDate(milliseconds):
    """Return ``milliseconds`` since the epoch as ``'YYYY-MM-DD HH:MM:SS'``.

    The conversion uses ``datetime.datetime.fromtimestamp`` without a
    time zone argument, so the result is in the machine's local time.
    """
    seconds = milliseconds / 1000.0
    return datetime.datetime.fromtimestamp(seconds).strftime('%Y-%m-%d %H:%M:%S')
#Tells whether a millisecond epoch timestamp falls before or after noon.
def timeStampToAMPM(milliseconds):
    """Return ``"AM"`` when the local hour is below 12, otherwise ``"PM"``.

    The hour is read from ``datetime.datetime.fromtimestamp`` called
    without a time zone, i.e. in the machine's local time.
    """
    local_hour = datetime.datetime.fromtimestamp(milliseconds / 1000.0).hour
    return "AM" if local_hour < 12 else "PM"
#Dispatches every timeline object of one export file to the matching CSV writer.
def parse_data(data):
    """Write all entries of ``data["timelineObjects"]`` to the CSV files.

    Entries with an ``"activitySegment"`` key are expanded into point rows
    and appended to the activity CSV; entries with a ``"placeVisit"`` key
    become one row in the places CSV. Anything else prints ``Error``.
    """
    for entry in data["timelineObjects"]:
        if "activitySegment" in entry:
            write_activity_points_csv(activitySegment(entry["activitySegment"]))
            continue
        if "placeVisit" in entry:
            write_places_csv(placeVisit(entry["placeVisit"]))
            continue
        print("Error")
#CSV writers.
def write_places_csv(place_data_list):
    """Append one row to ``FULL_places.csv`` in the current directory.

    Args:
        place_data_list: a flat list of values, e.g. the result of
            ``placeVisit``.

    The file is opened in append mode, so rows accumulate across calls
    (and across runs). It is written as UTF-8 explicitly: without an
    encoding, ``open`` uses the platform default, which on Windows is not
    UTF-8 and does not match what ``pd.read_csv`` expects by default, so
    non-ASCII place names or addresses would fail or be garbled.
    """
    with open('FULL_places.csv', 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow(place_data_list)
def write_activity_points_csv(point_data_list):
    """Append several rows to ``FULL_activity_points.csv`` in the current directory.

    Args:
        point_data_list: a list of rows (each a flat list), e.g. the
            result of ``activitySegment``.

    The file is opened in append mode, so rows accumulate across calls
    (and across runs). It is written as UTF-8 explicitly so the content
    does not depend on the platform's default encoding and matches the
    default of ``pd.read_csv``.
    """
    with open('FULL_activity_points.csv', 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerows(point_data_list)
#-------------------------------------------------------------------------------
# Parse every monthly export in the "2020" folder into the two FULL_*.csv files.
#-------------------------------------------------------------------------------
# The CSV writers open their files in append mode, so remove output left over
# from a previous run first; otherwise re-running this cell writes every row
# a second time and the data ends up duplicated.
for stale_output in ('FULL_places.csv', 'FULL_activity_points.csv'):
    if os.path.exists(stale_output):
        os.remove(stale_output)

# sorted() makes the row order reproducible; os.listdir returns entries in
# arbitrary order.
for file in sorted(os.listdir("2020")):
    # JSON is read as UTF-8 explicitly instead of the platform default
    # encoding, which would garble non-ASCII text on Windows.
    with open(f"2020/{file}", encoding="utf-8") as f:
        data = json.load(f)
    parse_data(data)
#column names
# The CSV files are written without a header row, so the names are supplied here
# in the same order as the lists built by placeVisit and the activity functions.
colnames1=['place_id', 'lat', 'lon', 'address', 'start_time', 'end_time', 'confidence', 'place_name']
colnames2=['trip_id', 'order', 'lat', 'lon', 'time_stamp', 'distance', 'ac_type', 'confidence', 'time_convention']
# Read the files back through the same relative names the CSV writers used, so
# this cell keeps working when the export lives in a different directory
# (the working directory was set with os.chdir earlier in the notebook).
places_df = pd.read_csv('FULL_places.csv', names=colnames1, header=None)
activity_df = pd.read_csv('FULL_activity_points.csv', names=colnames2, header=None)
# Kept as separate top-level expressions so the notebook displays each result.
places_df.dtypes
activity_df.dtypes
places_df.shape
activity_df.shape
place_id object lat float64 lon float64 address object start_time object end_time object confidence int64 place_name object dtype: object
trip_id int64 order int64 lat float64 lon float64 time_stamp object distance int64 ac_type object confidence object time_convention object dtype: object
(2608, 8)
(14184, 9)
#change date from string to datetime
# Only start_time (places) and time_stamp (activity) are converted;
# places_df['end_time'] stays as the string read from the CSV.
places_df['start_time'] = pd.to_datetime(places_df['start_time'])
activity_df['time_stamp'] = pd.to_datetime(activity_df['time_stamp'])
#new column for date only
places_df['date'] = places_df['start_time'].dt.date
activity_df['date'] = activity_df['time_stamp'].dt.date
#check for NA data and duplicated data
# duplicated() counts rows that repeat an earlier row in every column.
# NOTE(review): the FULL_*.csv files are written in append mode, so running the
# parsing cell more than once is one way such full-row repeats can appear.
places_df.isna().sum()
activity_df.isna().sum()
places_df.duplicated().sum()
activity_df.duplicated().sum()
place_id 0 lat 0 lon 0 address 0 start_time 0 end_time 0 confidence 0 place_name 0 date 0 dtype: int64
trip_id 0 order 0 lat 0 lon 0 time_stamp 0 distance 0 ac_type 0 confidence 0 time_convention 0 date 0 dtype: int64
1304
7092
#new data frames for main columns of interest
# location_df: which address was visited on which date.
# aka_names_df: which place name goes with which address.
location_df = places_df[['date','address']]
aka_names_df = places_df[['place_name','address']]
# Missing-value counts per column for each subset.
location_df.isna().sum()
aka_names_df.isna().sum()
date 0 address 0 dtype: int64
place_name 0 address 0 dtype: int64
# Number of rows in each subset that repeat an earlier row.
location_df.duplicated().sum()
aka_names_df.duplicated().sum()
1561
2420
#remove duplicates
# Each drop is followed by a re-count, which should now be 0.
location_df = location_df.drop_duplicates()
location_df.duplicated().sum()
aka_names_df = aka_names_df.drop_duplicates()
aka_names_df.duplicated().sum()
0
0
#check range
# Earliest and latest visit date, to confirm which period the data covers.
location_df['date'].min()
location_df['date'].max()
datetime.date(2020, 1, 1)
datetime.date(2020, 12, 31)
#All data
# These two frames are exported as loaded: only location_df and aka_names_df
# had drop_duplicates() applied, so any repeated rows are still present here.
# to_csv is called without index=..., so the DataFrame index is written as the
# first column of every file.
places_df.to_csv('2020_places.csv')
activity_df.to_csv('2020_activity.csv')
#Subset of data
# Note the file names: location_df (date/address) goes to
# '2020_cleaned_locations.csv' and aka_names_df (place_name/address) goes to
# '2020_cleaned_places.csv'.
location_df.to_csv('2020_cleaned_locations.csv')
aka_names_df.to_csv('2020_cleaned_places.csv')