diff --git a/spotifyvis/views.py b/spotifyvis/views.py index e9ca559..5aa780c 100644 --- a/spotifyvis/views.py +++ b/spotifyvis/views.py @@ -168,11 +168,12 @@ def get_audio_features(track_id, headers): return features_dict -def update_std_dev(cur_mean, new_data_point, sample_size): +def update_std_dev(cur_mean, cur_std_dev, new_data_point, sample_size): """Calculates the standard deviation for a sample without storing all data points Args: cur_mean: the current mean for N = (sample_size - 1) + cur_std_dev: the current standard deviation for N = (sample_size - 1) new_data_point: a new data point sample_size: sample size including the new data point @@ -182,8 +183,12 @@ def update_std_dev(cur_mean, new_data_point, sample_size): # This is an implementationof Welford's method # http://jonisalonen.com/2013/deriving-welfords-method-for-computing-variance/ new_mean = ((sample_size - 1) * cur_mean + new_data_point) / sample_size - std_dev = (new_data_point - new_mean) * (new_data_point - cur_mean) - return new_mean, std_dev + delta_variance = (new_data_point - new_mean) * (new_data_point - cur_mean) + new_std_dev = math.sqrt( + (math.pow(cur_std_dev, 2) * (sample_size - 2) + delta_variance) / ( + sample_size - 1 + )) + return new_mean, new_std_dev def update_audio_feature_stats(feature, new_data_point, sample_size): @@ -203,13 +208,13 @@ def update_audio_feature_stats(feature, new_data_point, sample_size): "average": new_data_point, "std_dev": 0, } - else: current_mean = library_stats['audio_features'][feature]['average'] - updated_mean, std_dev = update_std_dev(current_mean, new_data_point, sample_size) + cur_std_dev = library_stats['audio_features'][feature]['std_dev'] + updated_mean, new_std_dev = update_std_dev(current_mean, cur_std_dev, new_data_point, sample_size) library_stats['audio_features'][feature]['average'] = updated_mean - library_stats['audio_features'][feature]['std_dev'] = std_dev + library_stats['audio_features'][feature]['std_dev'] = new_std_dev # 
parse_library {{{ # @@ -228,12 +233,19 @@ def parse_library(headers, tracks): # keeps track of point to get songs from offset = 0 payload = {'limit': str(limit)} - for i in range(0, tracks, limit): + for _ in range(0, tracks, limit): payload['offset'] = str(offset) saved_tracks_response = requests.get('https://api.spotify.com/v1/me/tracks', headers=headers, params=payload).json() + num_samples = offset for track_dict in saved_tracks_response['items']: + # Track the number of samples for calculating + # audio feature averages and standard deviations on the fly + num_samples += 1 get_track_info(track_dict['track']) # get_genre(headers, track_dict['track']['album']['id']) + audio_features_dict = get_audio_features(track_dict['track']['id'], headers) + for feature, feature_data in audio_features_dict.items(): + update_audio_feature_stats(feature, feature_data, num_samples) for artist_dict in track_dict['track']['artists']: increase_artist_count(headers, artist_dict['name'], artist_dict['id']) # calculates num_songs with offset + songs retrieved