Implement audio feature statistics calculation

Implemented the calculation of the average and standard deviation of audio features.
2018-05-20 16:28:46 -04:00
parent 9878e7954d
commit d7002f7571
1 changed file with 19 additions and 7 deletions
--- a/spotifyvis/views.py
+++ b/spotifyvis/views.py
@@ -168,11 +168,12 @@ def get_audio_features(track_id, headers):
    return features_dict


-def update_std_dev(cur_mean, new_data_point, sample_size):
+def update_std_dev(cur_mean, cur_std_dev, new_data_point, sample_size):
    """Calculates the standard deviation for a sample without storing all data points

    Args:
        cur_mean: the current mean for N = (sample_size - 1)
+        cur_std_dev: the current standard deviation for N = (sample_size - 1)
        new_data_point: a new data point
        sample_size: sample size including the new data point
    
@@ -182,8 +183,12 @@ def update_std_dev(cur_mean, new_data_point, sample_size):
    # This is an implementation of Welford's method
    # http://jonisalonen.com/2013/deriving-welfords-method-for-computing-variance/
    new_mean = ((sample_size - 1) * cur_mean + new_data_point) / sample_size
-    std_dev = (new_data_point - new_mean) * (new_data_point - cur_mean)
-    return new_mean, std_dev
+    delta_variance = (new_data_point - new_mean) * (new_data_point - cur_mean)
+    new_std_dev = math.sqrt(
+        (math.pow(cur_std_dev, 2) * (sample_size - 2) + delta_variance) / (
+        sample_size - 1
+    ))
+    return new_mean, new_std_dev


 def update_audio_feature_stats(feature, new_data_point, sample_size):
@@ -203,13 +208,13 @@ def update_audio_feature_stats(feature, new_data_point, sample_size):
            "average": new_data_point,
            "std_dev": 0,
        }
-
    else:
        current_mean = library_stats['audio_features'][feature]['average']
-        updated_mean, std_dev = update_std_dev(current_mean, new_data_point, sample_size)
+        cur_std_dev = library_stats['audio_features'][feature]['std_dev']
+        updated_mean, new_std_dev = update_std_dev(current_mean, cur_std_dev, new_data_point, sample_size)

        library_stats['audio_features'][feature]['average'] = updated_mean
-        library_stats['audio_features'][feature]['std_dev'] = std_dev
+        library_stats['audio_features'][feature]['std_dev'] = new_std_dev


 #  parse_library {{{ # 
@@ -228,12 +233,19 @@ def parse_library(headers, tracks):
    # keeps track of point to get songs from
    offset = 0
    payload = {'limit': str(limit)}
-    for i in range(0, tracks, limit):
+    for _ in range(0, tracks, limit):
        payload['offset'] = str(offset)
        saved_tracks_response = requests.get('https://api.spotify.com/v1/me/tracks', headers=headers, params=payload).json()
+        num_samples = offset
        for track_dict in saved_tracks_response['items']:
+            # Track the number of samples for calculating
+            # audio feature averages and standard deviations on the fly
+            num_samples += 1 
            get_track_info(track_dict['track'])
            #  get_genre(headers, track_dict['track']['album']['id'])
+            audio_features_dict = get_audio_features(track_dict['id'], headers)
+            for feature, feature_data in audio_features_dict.items():
+                update_audio_feature_stats(feature, feature_data, num_samples)
            for artist_dict in track_dict['track']['artists']:
                increase_artist_count(headers, artist_dict['name'], artist_dict['id'])
        # calculates num_songs with offset + songs retrieved