My Fitbit history data was exported from my Fitbit Dashboard.
The goal is to use this data to understand personal fitness habits and trends.
Date Range of Data
For now, the data I will work with is from 2020 as it is the only complete year of data available.
Directory Structure of the Fitbit Data:
/Physical Activity
From here I create separate directories for each data set type.
Each data set is stored as many separate JSON files. For example, one of the steps files is named steps-2019-08-21.json.
This process is repeated for all different data sets/directories.
import os
import pandas as pd
import numpy as np
import seaborn as sns
import glob
import shutil
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import register_matplotlib_converters
%matplotlib inline
# Show the working directory the notebook starts in.
os.getcwd()
'c:\\Users\\Brandi\\Documents\\Data Projects\\my-data-story'
# Switch into the exported "Physical_Activity" folder so the relative directory
# names used by later cells resolve against it.
# NOTE: this is a hard-coded absolute path and is specific to one machine.
os.chdir('c:\\Users\\Brandi\\Documents\\my_fitbit_data\\Physical_Activity')
os.getcwd()
'c:\\Users\\Brandi\\Documents\\my_fitbit_data\\Physical_Activity'
# to be run only once.
# #to list collection of unique file names without timestamp
# filenames = os.listdir()
# print(f"Parsing {len(filenames)} files for unique types.")
# unique_filenames = set()
# for f in filenames:
# unique_filenames.add(f.split("-")[0])
# print(f"Found {len(unique_filenames)} unique types.")
# for name in sorted(unique_filenames):
# print(name)
# to be run only once.
# create new directory with relative path.
# move similarly named files to new directory
# os.mkdir(os.path.join(".","steps_data"))
# dest_dir = os.path.join(".","steps_data")
# for file in glob.glob('steps*'):
# shutil.move(file, dest_dir)
# os.mkdir(os.path.join(".","distance_data"))
# dest_dir = os.path.join(".","distance_data")
# for file in glob.glob('distance*'):
# shutil.move(file, dest_dir)
# os.mkdir(os.path.join(".","calories_data"))
# dest_dir = os.path.join(".","calories_data")
# for file in glob.glob('calories*'):
# shutil.move(file, dest_dir)
# os.mkdir(os.path.join(".","heart_rate_data"))
# dest_dir = os.path.join(".","heart_rate_data")
# for file in glob.glob('heart_rate*'):
# shutil.move(file, dest_dir)
# os.mkdir(os.path.join(".","lightly_active_minutes_data"))
# dest_dir = os.path.join(".","lightly_active_minutes_data")
# for file in glob.glob('lightly*'):
# shutil.move(file, dest_dir)
# os.mkdir(os.path.join(".","moderately_active_minutes_data"))
# dest_dir = os.path.join(".","moderately_active_minutes_data")
# for file in glob.glob('moderately*'):
# shutil.move(file, dest_dir)
# os.mkdir(os.path.join(".","sedentary_minutes_data"))
# dest_dir = os.path.join(".","sedentary_minutes_data")
# for file in glob.glob('sedentary*'):
# shutil.move(file, dest_dir)
# os.mkdir(os.path.join(".","very_active_minutes_data"))
# dest_dir = os.path.join(".","very_active_minutes_data")
# for file in glob.glob('very_active*'):
# shutil.move(file, dest_dir)
# os.mkdir(os.path.join(".","time_in_heart_rate_zones_data"))
# dest_dir = os.path.join(".","time_in_heart_rate_zones_data")
# for file in glob.glob('time_in_heart*'):
# shutil.move(file, dest_dir)
def _load_json_dir(directory):
    """Read every ``*.json`` file in *directory* into a single DataFrame.

    Only files ending in ``.json`` are read, so stray entries in the folder
    (OS metadata files, editor checkpoints) are not handed to
    ``pd.read_json``. Files are read in sorted filename order so the
    concatenated result is reproducible between runs.

    Args:
        directory: Folder to read, relative to the current working directory.

    Returns:
        The per-file frames concatenated in filename order.

    Raises:
        FileNotFoundError: If *directory* contains no ``.json`` files.
    """
    paths = sorted(glob.glob(os.path.join(directory, "*.json")))
    if not paths:
        raise FileNotFoundError(f"No JSON files found in {directory!r}")
    return pd.concat([pd.read_json(path) for path in paths])


# One combined frame per data set.
df_steps = _load_json_dir("steps_data")
df_distance = _load_json_dir("distance_data")
df_calories = _load_json_dir("calories_data")
# Use the "dateTime" column as the index of each frame and sort the rows by it.
# No explicit pd.to_datetime conversion is applied here; the column is used
# exactly as it was loaded.
for frame in (df_calories, df_distance, df_steps):
    frame.set_index("dateTime", drop=True, inplace=True)
    frame.sort_index(inplace=True)
# Derive calendar columns from the datetime index for use in visualizations.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
for frame in (df_calories, df_distance, df_steps):
    frame["year"] = frame.index.year
    frame["month"] = frame.index.month
    frame["day"] = frame.index.day
    frame['weekday'] = frame.index.dayofweek
    # Ordered categorical so weekday names sort Monday..Sunday rather than
    # alphabetically.
    frame['weekday_name'] = pd.Categorical(
        frame.index.day_name(), categories=weekday_order, ordered=True
    )
# Convert distance from centimetres to miles (approximate).
# NOTE: this overwrites "value", so running the cell a second time divides again.
cm_per_mile = 160934
df_distance["value"] = df_distance["value"] / cm_per_mile
#checking min and max dates via index
# Each pair shows the first and last index label that holds a non-null value,
# in the order calories, distance, steps.
df_calories.first_valid_index()
df_calories.last_valid_index()
df_distance.first_valid_index()
df_distance.last_valid_index()
df_steps.first_valid_index()
df_steps.last_valid_index()
Timestamp('2019-08-21 00:00:00')
Timestamp('2021-07-17 18:00:00')
Timestamp('2019-08-22 15:08:00')
Timestamp('2021-07-16 21:45:00')
Timestamp('2019-08-22 15:08:00')
Timestamp('2021-07-16 21:45:00')
#checking datatypes
# Displayed in the order steps, distance, calories.
df_steps.dtypes
df_distance.dtypes
df_calories.dtypes
value int64 year int64 month int64 day int64 weekday int64 weekday_name category dtype: object
value float64 year int64 month int64 day int64 weekday int64 weekday_name category dtype: object
value float64 year int64 month int64 day int64 weekday int64 weekday_name category dtype: object
# Preview the first five rows of each frame (steps, calories, distance).
df_steps.head(5)
df_calories.head(5)
df_distance.head(5)
value | year | month | day | weekday | weekday_name | |
---|---|---|---|---|---|---|
dateTime | ||||||
2019-08-22 15:08:00 | 0 | 2019 | 8 | 22 | 3 | Thursday |
2019-08-22 15:09:00 | 0 | 2019 | 8 | 22 | 3 | Thursday |
2019-08-22 15:10:00 | 0 | 2019 | 8 | 22 | 3 | Thursday |
2019-08-22 15:13:00 | 0 | 2019 | 8 | 22 | 3 | Thursday |
2019-08-22 15:14:00 | 0 | 2019 | 8 | 22 | 3 | Thursday |
value | year | month | day | weekday | weekday_name | |
---|---|---|---|---|---|---|
dateTime | ||||||
2019-08-21 00:00:00 | 1.14 | 2019 | 8 | 21 | 2 | Wednesday |
2019-08-21 00:01:00 | 1.14 | 2019 | 8 | 21 | 2 | Wednesday |
2019-08-21 00:02:00 | 1.14 | 2019 | 8 | 21 | 2 | Wednesday |
2019-08-21 00:03:00 | 1.14 | 2019 | 8 | 21 | 2 | Wednesday |
2019-08-21 00:04:00 | 1.14 | 2019 | 8 | 21 | 2 | Wednesday |
value | year | month | day | weekday | weekday_name | |
---|---|---|---|---|---|---|
dateTime | ||||||
2019-08-22 15:08:00 | 0.0 | 2019 | 8 | 22 | 3 | Thursday |
2019-08-22 15:09:00 | 0.0 | 2019 | 8 | 22 | 3 | Thursday |
2019-08-22 15:10:00 | 0.0 | 2019 | 8 | 22 | 3 | Thursday |
2019-08-22 15:13:00 | 0.0 | 2019 | 8 | 22 | 3 | Thursday |
2019-08-22 15:14:00 | 0.0 | 2019 | 8 | 22 | 3 | Thursday |
A comparison between the exported JSON data and the Fitbit online dashboard shows that the values do not match exactly, but they are fairly close.
# Restrict each frame to June 2021 using one shared date slice.
june_2021 = slice('2021-06-01', '2021-06-30')
drange_steps = df_steps.loc[june_2021]
drange_distance = df_distance.loc[june_2021]
drange_calories = df_calories.loc[june_2021]

# Resample by month and sum to get the total of each metric.
june_steps_total = drange_steps["value"].resample("M").sum()
june_distance_total = drange_distance["value"].resample("M").sum()
june_calories_total = drange_calories["value"].resample("M").sum()

# Display the three totals.
june_steps_total
june_distance_total
june_calories_total
dateTime 2021-06-30 168846 Freq: M, Name: value, dtype: int64
dateTime 2021-06-30 79.704413 Freq: M, Name: value, dtype: float64
dateTime 2021-06-30 68798.59 Freq: M, Name: value, dtype: float64
# Daily totals per metric, each as a one-column frame named after the metric.
resampled_steps = df_steps["value"].resample("D").sum().to_frame(name="steps")
resampled_calories = df_calories["value"].resample("D").sum().to_frame(name="calories")
resampled_distance = df_distance["value"].resample("D").sum().to_frame(name="distance")

# Join the three daily frames on "dateTime". No `how` is given, so merge's
# default join type applies.
combined_steps_calories = resampled_steps.merge(resampled_calories, on=["dateTime"])
combined_all = combined_steps_calories.merge(resampled_distance, on=["dateTime"])
combined_all.head()
steps | calories | distance | |
---|---|---|---|
dateTime | |||
2019-08-22 | 5098 | 2059.92 | 2.468838 |
2019-08-23 | 6097 | 2495.78 | 2.814197 |
2019-08-24 | 9726 | 2879.78 | 4.611704 |
2019-08-25 | 9396 | 2646.37 | 4.455056 |
2019-08-26 | 9103 | 2538.33 | 4.313570 |
# Pair plot for a quick correlation overview of the merged daily metrics.
sns.set_theme(style="ticks")
sns.pairplot(combined_all)
<seaborn.axisgrid.PairGrid at 0x1b8ebc2bb80>
# Scatter of daily steps against daily calories with a fitted regression line.
point_style = {"color": "black"}
line_style = {"color": "blue"}
sns.regplot(x="steps", y="calories", data=combined_all, scatter_kws=point_style, line_kws=line_style)
plt.title("Steps vs Calories")
plt.xlabel("Steps")
plt.ylabel("Calories")
plt.show()
<AxesSubplot:xlabel='steps', ylabel='calories'>
Text(0.5, 1.0, 'Steps vs Calories')
Text(0.5, 0, 'Steps')
Text(0, 0.5, 'Calories')
# Daily totals of each metric as Series.
daily_steps, daily_calories, daily_distance = (
    frame['value'].resample('D').sum()
    for frame in (df_steps, df_calories, df_distance)
)

# Summary statistics for each daily series.
daily_steps.describe()
daily_calories.describe()
daily_distance.describe()
count 695.000000 mean 5711.004317 std 3029.732596 min 0.000000 25% 4204.000000 50% 5542.000000 75% 7328.000000 max 30883.000000 Name: value, dtype: float64
count 697.000000 mean 2301.589986 std 331.058724 min 1210.720000 25% 2133.820000 50% 2299.670000 75% 2507.790000 max 4201.970000 Name: value, dtype: float64
count 695.000000 mean 2.704453 std 1.445603 min 0.000000 25% 1.984789 50% 2.606099 75% 3.459337 max 14.662160 Name: value, dtype: float64
# Date with the highest daily step total
busiest_day = daily_steps.idxmax()
busiest_day
# Step count recorded on that date
daily_steps.loc[busiest_day]
Timestamp('2019-11-26 00:00:00', freq='D')
30883
# Bar chart of daily step totals in June 2021; the green line is the month's daily mean.
june_steps = daily_steps['2021-06']
june_steps.plot(kind="bar")
plt.axhline(june_steps.mean(), color='green')
plt.title("Total Daily Steps in June 2021")
plt.xlabel("Date")
plt.ylabel("Steps")
<AxesSubplot:xlabel='dateTime'>
<matplotlib.lines.Line2D at 0x1b8ebef1040>
Text(0.5, 1.0, 'Total Daily Steps in June 2021')
Text(0.5, 0, 'Date')
Text(0, 0.5, 'Steps')
# Bar chart of daily calorie totals in June 2021; the green line is the month's daily mean.
june_calories = daily_calories['2021-06']
june_calories.plot(kind="bar")
plt.axhline(june_calories.mean(), color='green')
plt.title("Total Daily Calories in June 2021")
plt.xlabel("Date")
plt.ylabel("Calories")
<AxesSubplot:xlabel='dateTime'>
<matplotlib.lines.Line2D at 0x1b8ebe34070>
Text(0.5, 1.0, 'Total Daily Calories in June 2021')
Text(0.5, 0, 'Date')
Text(0, 0.5, 'Calories')
# Bar chart of daily distance totals in June 2021; the green line is the month's daily mean.
june_distance = daily_distance['2021-06']
june_distance.plot(kind="bar")
plt.axhline(june_distance.mean(), color='green')
plt.title("Total Daily Distance in June 2021")
plt.xlabel("Date")
plt.ylabel("Miles")
<AxesSubplot:xlabel='dateTime'>
<matplotlib.lines.Line2D at 0x1b8ebd10640>
Text(0.5, 1.0, 'Total Daily Distance in June 2021')
Text(0.5, 0, 'Date')
Text(0, 0.5, 'Miles')
# Line chart of daily step totals across 2020; the green line is the year's daily mean.
steps_2020 = daily_steps['2020']
steps_2020.plot()
plt.axhline(steps_2020.mean(), color='green')
plt.title("Total Steps in 2020")
plt.xlabel("Date")
plt.ylabel("Steps")
<AxesSubplot:xlabel='dateTime'>
<matplotlib.lines.Line2D at 0x1b8f670c850>
Text(0.5, 1.0, 'Total Steps in 2020')
Text(0.5, 0, 'Date')
Text(0, 0.5, 'Steps')
# Line chart of daily calorie totals across 2020; the green line is the year's daily mean.
calories_2020 = daily_calories['2020']
calories_2020.plot()
plt.axhline(calories_2020.mean(), color='green')
plt.title("Total Calories in 2020")
plt.xlabel("Date")
plt.ylabel("Calories")
<AxesSubplot:xlabel='dateTime'>
<matplotlib.lines.Line2D at 0x1b8ebbfc520>
Text(0.5, 1.0, 'Total Calories in 2020')
Text(0.5, 0, 'Date')
Text(0, 0.5, 'Calories')
# Line chart of daily distance totals across 2020; the green line is the year's daily mean.
daily_distance['2020'].plot()
plt.axhline(daily_distance['2020'].mean(), color='green')
plt.title("Total Distance in 2020")
# The x axis is the date index and the y axis is distance in miles.
plt.xlabel("Date")
plt.ylabel("Miles")
<AxesSubplot:xlabel='dateTime'>
<matplotlib.lines.Line2D at 0x1b8ec160250>
Text(0.5, 1.0, 'Total Distance in 2020')
Text(0.5, 0, 'Miles')
Text(0, 0.5, 'Calories')
# Plot the merged daily frame's columns together against the date index.
combined_all.plot()
<AxesSubplot:xlabel='dateTime'>
# Calendar columns on the merged daily frame, used for the groupings that follow.
combined_all["year"] = combined_all.index.year
combined_all["month"] = combined_all.index.month
weekday_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# Ordered categorical so weekday names sort Monday..Sunday rather than alphabetically.
combined_all['weekday_name'] = pd.Categorical(
    combined_all.index.day_name(), categories=weekday_labels, ordered=True
)
# Mean of each daily metric per weekday, restricted to 2020.
# observed=False is spelled out so every weekday category is kept and the
# result does not depend on the installed pandas version's default.
g = combined_all.loc['2020'].groupby(["weekday_name"], observed=False)
# The string "mean" selects the built-in groupby mean; passing np.mean to
# aggregate is deprecated in recent pandas releases.
avg_weekday_steps = g.aggregate({"steps": "mean"})
avg_weekday_distance = g.aggregate({"distance": "mean"})
avg_weekday_calories = g.aggregate({"calories": "mean"})
# Bar chart of the per-weekday average steps.
avg_weekday_steps.plot.bar()
plt.title("Average Steps")
plt.xlabel("Day")
plt.ylabel("Steps")
plt.show()
<AxesSubplot:xlabel='weekday_name'>
Text(0.5, 1.0, 'Average Steps')
Text(0.5, 0, 'Day')
Text(0, 0.5, 'Steps')
# Bar chart of the per-weekday average distance.
avg_weekday_distance.plot.bar()
plt.title("Average Distance")
plt.xlabel("Day")
plt.ylabel("Distance")
plt.show()
<AxesSubplot:xlabel='weekday_name'>
Text(0.5, 1.0, 'Average Distance')
Text(0.5, 0, 'Day')
Text(0, 0.5, 'Distance')
# Bar chart of the per-weekday average calories.
avg_weekday_calories.plot.bar()
plt.title("Average Calories")
plt.xlabel("Day")
plt.ylabel("Calories")
plt.show()
<AxesSubplot:xlabel='weekday_name'>
Text(0.5, 1.0, 'Average Calories')
Text(0.5, 0, 'Day')
Text(0, 0.5, 'Calories')
# Mean of each daily metric per calendar month, restricted to 2020.
g = combined_all.loc['2020'].groupby(["month"])
# The string "mean" selects the built-in groupby mean; passing np.mean to
# aggregate is deprecated in recent pandas releases.
avg_monthly_steps = g.aggregate({"steps": "mean"})
avg_monthly_distance = g.aggregate({"distance": "mean"})
avg_monthly_calories = g.aggregate({"calories": "mean"})
# Bar chart of the per-month average steps.
avg_monthly_steps.plot.bar()
plt.title("Average Steps per Month")
plt.xlabel("Month")
plt.ylabel("Steps")
plt.show()
<AxesSubplot:xlabel='month'>
Text(0.5, 1.0, 'Average Steps per Month')
Text(0.5, 0, 'Month')
Text(0, 0.5, 'Steps')
# Bar chart of the per-month average distance.
avg_monthly_distance.plot.bar()
plt.title("Average Distance per Month")
plt.xlabel("Month")
plt.ylabel("Distance")
plt.show()
<AxesSubplot:xlabel='month'>
Text(0.5, 1.0, 'Average Distance per Month')
Text(0.5, 0, 'Month')
Text(0, 0.5, 'Distance')
# Bar chart of the per-month average calories.
avg_monthly_calories.plot.bar()
plt.title("Average Calories per Month")
plt.xlabel("Month")
plt.ylabel("Calories")
plt.show()
<AxesSubplot:xlabel='month'>
Text(0.5, 1.0, 'Average Calories per Month')
Text(0.5, 0, 'Month')
Text(0, 0.5, 'Calories')
Exporting finalized and cleaned data for future analysis and projects.
# Write the merged daily frame to a CSV file; the relative path resolves
# against the current working directory.
combined_all.to_csv('combined_all.csv')
Correlations
Weekly Data
Monthly Data