Notes#

Hide imports
import os
from collections import defaultdict, Counter

from git import Repo
import dimcat as dc
import ms3
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from utils import STD_LAYOUT, CADENCE_COLORS, CORPUS_COLOR_SCALE, chronological_corpus_order, color_background, get_corpus_display_name, get_repo_name, resolve_dir, value_count_df, get_repo_name, print_heading, resolve_dir
Hide source
# --- Notebook settings -------------------------------------------------------
# The corpus root is taken to be two directory levels above the working directory.
CORPUS_PATH = os.path.abspath(os.path.join(os.pardir, os.pardir))
# The flag is on when the environment variable is "true", "1" or "t"
# (case-insensitive); it defaults to on when the variable is unset.
ANNOTATED_ONLY = os.environ.get("ANNOTATED_ONLY", "True").lower() in {'true', '1', 't'}
print_heading("Notebook settings")
# The path is reported as configured, i.e. before resolve_dir() is applied to it.
print(f"CORPUS_PATH: {CORPUS_PATH!r}")
print(f"ANNOTATED_ONLY: {ANNOTATED_ONLY}")
CORPUS_PATH = resolve_dir(CORPUS_PATH)
Notebook settings
-----------------

CORPUS_PATH: '/home/runner/work/workflow_deployment/debussy_piano'
ANNOTATED_ONLY: False
Hide source
# Report which data commit and which library versions produced this notebook.
repo = Repo(CORPUS_PATH)
print_heading("Data and software versions")
print(f"Data repo '{get_repo_name(repo)}' @ {repo.commit().hexsha[:7]}")  # abbreviated commit hash
for package in (dc, ms3):
    # A module's __name__ is its import name, i.e. 'dimcat' and 'ms3'.
    print(f"{package.__name__} version {package.__version__}")
Data and software versions
--------------------------

Data repo 'debussy_piano' @ fd3b785
dimcat version 0.3.0
ms3 version 2.5.2
# Create a dimcat Dataset and load the data located under CORPUS_PATH.
# NOTE(review): parse_tsv=False presumably defers parsing of the TSV files until
# they are requested — confirm against the dimcat documentation.
dataset = dc.Dataset()
dataset.load(directory=CORPUS_PATH, parse_tsv=False)
[default|all]
All corpora
-----------
View: This view is called 'default'. It 
	- excludes pieces that are not contained in the metadata,
	- filters out file extensions requiring conversion (such as .xml), and
	- excludes review files and folders.

                                has   active   scores measures           notes        expanded       
                           metadata     view detected detected parsed detected parsed detected parsed
corpus                                                                                               
debussy_childrens_corner        yes  default        6        6      6        6      6        0      0
debussy_deux_arabesques         yes  default        2        2      2        2      2        0      0
debussy_estampes                yes  default        3        3      3        3      3        0      0
debussy_etudes                  yes  default       12       12     12       12     12        0      0
debussy_images                  yes  default        9        9      9        9      9        0      0
debussy_other_piano_pieces      yes  default       19       19     19       19     19        0      0
debussy_pour_le_piano           yes  default        3        3      3        3      3        0      0
debussy_preludes                yes  default       24       24     24       24     24        0      0
debussy_suite_bergamasque       yes  default        4        4      4        4      4        4      4
publication_data_and_code        no  default        0        0      0        0      0        0      0

28/860 files are excluded from this view.

28 files have been excluded based on their subdir.
N = 82 annotated pieces, 168 parsed dataframes.

Metadata#

# Collect the metadata of all corpora and report how many scores it covers.
all_metadata = dataset.data.metadata()
n_covered, n_scores = len(all_metadata), dataset.data.count_pieces()
print(f"Concatenated 'metadata.tsv' files cover {n_covered} of the {n_scores} scores.")
# Preview: the first row of every corpus (outer index level), first 20 columns.
all_metadata.reset_index(level=1).groupby(level=0).nth(0).iloc[:, :20]
Concatenated 'metadata.tsv' files cover 82 of the 82 scores.
piece TimeSig KeySig last_mc last_mn length_qb last_mc_unfolded last_mn_unfolded length_qb_unfolded all_notes_qb n_onsets n_onset_positions guitar_chord_count form_label_count label_count composed_start composed_end composer workTitle movementNumber
corpus
debussy_childrens_corner l113-01_childrens_doctor {1: '4/4'} {1: 0, 33: -2, 37: -4, 45: 0} 76.0 76.0 304.00 76.0 76.0 304.00 707.25 1259.0 1033.0 0.0 0.0 0.0 1906.0 1908.0 Claude Debussy NaN
debussy_deux_arabesques l066-01_arabesques_premiere {1: '4/4', 94: '2/4', 95: '4/4'} {1: 4, 39: 3, 71: 4} 107.0 107.0 426.00 107.0 107.0 426.00 1207.83 1484.0 1018.0 0.0 0.0 0.0 1888.0 1888.0 Claude Debussy Premiere Arabesque
debussy_estampes l100-01_estampes_pagode {1: '4/4', 92: '2/4', 93: '4/4', 94: '2/4', 95... {1: 5} 98.0 98.0 388.00 98.0 98.0 388.00 1913.08 2733.0 1486.0 0.0 0.0 0.0 1903.0 1903.0 Claude Debussy NaN
debussy_etudes l136-01_etudes_cinq {1: '4/4', 6: '2/4', 7: '6/16', 11: '4/4', 17:... {1: 0, 48: -7, 56: 0, 75: -4, 90: 0, 111: -5, ... 116.0 116.0 317.00 116.0 116.0 317.00 682.37 2067.0 1354.0 0.0 0.0 0.0 1915.0 1915.0 Claude Debussy Étude no.1: pour les cinq doigts
debussy_images l087-01_images_lent {1: '3/4', 21: '4/4', 22: '3/4'} {1: 3, 32: 6} 57.0 57.0 172.00 57.0 57.0 172.00 968.33 1229.0 354.0 0.0 0.0 0.0 1894.0 1894.0 Claude Debussy 1
debussy_other_piano_pieces l000_etude {1: '4/4'} {1: -4, 7: 0, 9: -5, 38: 0, 42: -4, 48: -3, 57... 73.0 71.0 284.00 73.0 71.0 284.00 959.25 2473.0 1901.0 0.0 0.0 0.0 1915.0 1915.0 Claude Debussy Etude Retrouve
debussy_pour_le_piano l095-01_pour_prelude {1: '3/4'} {1: 0} 163.0 163.0 536.12 163.0 163.0 536.12 1882.87 3349.0 1965.0 0.0 0.0 0.0 1901.0 1901.0 Claude Debussy NaN
debussy_preludes l117-01_preludes_danseuses {1: '3/4'} {1: -2} 31.0 31.0 97.00 31.0 31.0 97.00 756.00 822.0 171.0 0.0 0.0 0.0 1909.0 1909.0 NaN NaN
debussy_suite_bergamasque l075-01_suite_prelude {1: '4/4'} {1: -1} 89.0 89.0 356.00 89.0 89.0 356.00 1533.67 1721.0 870.0 0.0 0.0 274.0 1890.0 1905.0 Claude Debussy Suite Bergamasque 1

Compute chronological order

# Order the corpora chronologically and pair each one, in that order, with a
# colour from CORPUS_COLOR_SCALE (zip stops at the shorter of the two).
chronological_order = chronological_corpus_order(all_metadata)
corpus_colors = {
    corpus: color
    for corpus, color in zip(chronological_order, CORPUS_COLOR_SCALE)
}
chronological_order
['debussy_deux_arabesques',
 'debussy_pour_le_piano',
 'debussy_images',
 'debussy_other_piano_pieces',
 'debussy_estampes',
 'debussy_suite_bergamasque',
 'debussy_childrens_corner',
 'debussy_preludes',
 'debussy_etudes']
# Fetch the note tables of all pieces as a single DataFrame.
all_notes = dataset.data.get_all_parsed('notes', force=True, flat=True)
n_notes = len(all_notes.index)
# The first two index levels are corpus and piece, so each group is one file.
n_files = len(all_notes.groupby(level=[0, 1]))
print(f"{n_notes} notes over {n_files} files.")
all_notes.head()
WARNING  ms3.Parse.debussy_etudes -- /home/runner/.local/lib/python3.10/site-packages/ms3/corpus.py (line 1255) check_number_of_unparsed_scores():
	You have set force=True, which forces me to parse 12 scores iteratively. Next time, call _.parse() on me, so we can speed this up!
WARNING  ms3.Parse.debussy_other_piano_pieces -- /home/runner/.local/lib/python3.10/site-packages/ms3/corpus.py (line 1255) check_number_of_unparsed_scores():
	You have set force=True, which forces me to parse 19 scores iteratively. Next time, call _.parse() on me, so we can speed this up!
WARNING  ms3.Parse.debussy_preludes -- /home/runner/.local/lib/python3.10/site-packages/ms3/corpus.py (line 1255) check_number_of_unparsed_scores():
	You have set force=True, which forces me to parse 24 scores iteratively. Next time, call _.parse() on me, so we can speed this up!
149385 notes over 82 files.
mc mn quarterbeats duration_qb mc_onset mn_onset timesig staff voice duration ... nominal_duration scalar tied tpc midi name octave chord_id tremolo quarterbeats_all_endings
corpus piece i
debussy_childrens_corner l113-01_childrens_doctor 0 1 1 0 0.00 0 0 4/4 2 1 0 ... 1/8 1 <NA> 0 36 C2 2 15 NaN NaN
1 1 1 0 4.00 0 0 4/4 2 1 1 ... 1 1 1 0 48 C3 3 16 NaN NaN
2 1 1 1/4 0.25 1/16 1/16 4/4 1 1 1/16 ... 1/16 1 <NA> 1 55 G3 3 0 NaN NaN
3 1 1 1/2 0.25 1/8 1/8 4/4 1 1 1/16 ... 1/16 1 <NA> 0 60 C4 4 1 NaN NaN
4 1 1 3/4 0.25 3/16 3/16 4/4 1 1 1/16 ... 1/16 1 <NA> 2 62 D4 4 2 NaN NaN

5 rows × 21 columns

def weight_notes(nl: pd.DataFrame, group_col='midi', precise=True) -> pd.Series:
    """Turn a note list into repeated values that reflect sounding duration.

    The ``duration_qb`` values of ``nl`` are summed for every distinct value of
    ``group_col`` and divided by the smallest positive sum, so that the value
    with the shortest total duration gets a weight of 1 and all others a
    proportionally larger one. The weights are then handed to
    ``repeat_notes_according_to_weights()``, which repeats each value
    according to its rounded weight.

    Args:
        nl: Note list with a ``duration_qb`` column and the column(s) named by
            ``group_col``.
        group_col: Grouping key passed to ``DataFrame.groupby``.
        precise: If False, all weights are additionally divided by 1.9999999.
            This roughly halves the number of repeated values (less compute,
            less precision) while the smallest weight stays just above 0.5
            and therefore still rounds up to one occurrence.

    Returns:
        The Series produced by ``repeat_notes_according_to_weights()``, or an
        empty integer Series if no group has a positive total duration.
    """
    summed_durations = nl.groupby(group_col).duration_qb.sum()
    positive_durations = summed_durations[summed_durations > 0]
    if positive_durations.empty:
        # Nothing to normalise by: dividing by the NaN minimum would only
        # produce NaN weights, so return the empty result explicitly.
        return pd.Series(dtype=int)
    # Normalise such that the shortest duration results in 1 occurrence.
    weights = summed_durations / positive_durations.min()
    if not precise:
        # The smallest value must be slightly larger than 0.5 because exactly
        # 0.5 would be rounded down by repeat_notes_according_to_weights().
        weights = weights / 1.9999999
    return repeat_notes_according_to_weights(weights)
    
def repeat_notes_according_to_weights(weights: pd.Series) -> pd.Series:
    """Repeat every index value of ``weights`` according to its rounded weight.

    Args:
        weights: Series whose index holds the values to repeat (e.g. pitches)
            and whose values are the numeric weights.

    Returns:
        A Series with a fresh default index containing each index value of
        ``weights`` repeated ``round(weight)`` times, in the order of
        ``weights``. Weights that round to zero or below contribute nothing.
        If the weights cannot be converted to integers (non-numeric, NaN or
        infinite values), an empty integer Series is returned.
    """
    try:
        counts = weights.round().astype(int)
    except (TypeError, ValueError):
        # Best effort: weights that cannot be cast to int yield an empty result.
        return pd.Series(dtype=int)
    # A negative repetition count means "no occurrence", as with ``[x] * -1``.
    counts = counts.clip(lower=0)
    repeated = counts.index.repeat(counts.to_numpy())
    # Going through to_numpy() drops the index name, so the result is unnamed.
    return pd.Series(repeated.to_numpy())

Ambitus#

# Display names and colours per corpus, kept in chronological order.
corpus_names = {corp: get_corpus_display_name(corp) for corp in chronological_order}
chronological_corpus_names = [corpus_names[corp] for corp in chronological_order]
corpus_name_colors = {
    corpus_names[corp]: color
    for corp, color in corpus_colors.items()
}
# Tag every note with the display name of its corpus (outermost index level).
all_notes['corpus_name'] = all_notes.index.get_level_values(0).map(corpus_names)
grouped_notes = all_notes.groupby('corpus_name')
# One duration-weighted MIDI series per corpus, stacked with the corpus name
# as outer index level, which is then moved into a regular column.
weighted_midi = pd.concat(
    {name: weight_notes(nl, 'midi', precise=False) for name, nl in grouped_notes}
).reset_index(level=0)
weighted_midi.columns = ['dataset', 'midi']
weighted_midi
dataset midi
0 Debussy Childrens Corner 22
1 Debussy Childrens Corner 22
2 Debussy Childrens Corner 24
3 Debussy Childrens Corner 24
4 Debussy Childrens Corner 27
... ... ...
6089 Debussy Suite Bergamasque 97
6090 Debussy Suite Bergamasque 97
6091 Debussy Suite Bergamasque 97
6092 Debussy Suite Bergamasque 97
6093 Debussy Suite Bergamasque 97

73236 rows × 2 columns

# Label the MIDI axis at every C, from C0 (MIDI 12) up to C7 (MIDI 96).
octave_ticks = {12 * (octave + 1): f"C{octave}" for octave in range(8)}
yaxis = dict(
    tickmode='array',
    tickvals=list(octave_ticks),
    ticktext=list(octave_ticks.values()),
    gridcolor='lightgrey',
)
# One violin (with inner box plot) per corpus, in chronological order.
fig = px.violin(
    weighted_midi,
    x='dataset',
    y='midi',
    color='dataset',
    box=True,
    labels={
        'dataset': '',
        'midi': 'distribution of pitches by duration',
    },
    category_orders={'dataset': chronological_corpus_names},
    color_discrete_map=corpus_name_colors,
    width=1000,
    height=600,
)
# 'hard' span: the violins do not extend beyond the outliers.
fig.update_traces(spanmode='hard')
fig.update_layout(yaxis=yaxis, showlegend=False, **STD_LAYOUT)
fig.show()

Tonal Pitch Classes (TPC)#

# One duration-weighted TPC series per corpus (precise weighting), stacked with
# the corpus name as outer index level, which is then moved into a column.
weighted_tpc = pd.concat(
    {name: weight_notes(nl, 'tpc') for name, nl in grouped_notes}
).reset_index(level=0)
weighted_tpc.columns = ['dataset', 'tpc']
weighted_tpc
dataset tpc
0 Debussy Childrens Corner -8
1 Debussy Childrens Corner -8
2 Debussy Childrens Corner -8
3 Debussy Childrens Corner -7
4 Debussy Childrens Corner -7
... ... ...
2431 Debussy Suite Bergamasque 12
2432 Debussy Suite Bergamasque 12
2433 Debussy Suite Bergamasque 12
2434 Debussy Suite Bergamasque 12
2435 Debussy Suite Bergamasque 13

189783 rows × 2 columns

As violin plot#

# Tick every third tonal pitch class from -12 to 18 and label it by note name.
yaxis = dict(
    tickmode='array',
    tickvals=list(range(-12, 19, 3)),
    ticktext=["Dbb", "Bbb", "Gb", "Eb", "C", "A", "F#", "D#", "B#", "G##", "E##"],
    gridcolor='lightgrey',
    zeroline=True,
    zerolinecolor='lightgrey',
)
# One violin (with inner box plot) per corpus, in chronological order.
fig = px.violin(
    weighted_tpc,
    x='dataset',
    y='tpc',
    color='dataset',
    box=True,
    labels={
        'dataset': '',
        'tpc': 'distribution of tonal pitch classes by duration',
    },
    category_orders={'dataset': chronological_corpus_names},
    color_discrete_map=corpus_name_colors,
    width=1000,
    height=600,
)
# 'hard' span: the violins do not extend beyond the outliers.
fig.update_traces(spanmode='hard')
fig.update_layout(yaxis=yaxis, showlegend=False, **STD_LAYOUT)
fig.show()

As bar plots#

# Total duration per tonal pitch class over all corpora.
bar_data = all_notes.groupby('tpc')['duration_qb'].sum().reset_index()
# Every integer between the lowest and highest occurring tpc gets a tick,
# labelled with the name that ms3.fifths2name() returns for it.
x_values = list(range(bar_data.tpc.min(), bar_data.tpc.max() + 1))
x_names = ms3.fifths2name(x_values)
fig = px.bar(
    bar_data,
    x='tpc',
    y='duration_qb',
    labels={
        'tpc': 'Named pitch class',
        'duration_qb': 'Duration in quarter notes',
    },
    color_discrete_sequence=CORPUS_COLOR_SCALE,
    width=1000,
    height=300,
)
fig.update_layout(**STD_LAYOUT)
fig.update_yaxes(gridcolor='lightgrey')
fig.update_xaxes(
    gridcolor='lightgrey',
    zerolinecolor='grey',
    tickmode='array',
    tickvals=x_values,
    ticktext=x_names,
    dtick=1,
    ticks='outside',
    tickcolor='black',
    # Minor grid lines with a spacing of 6 tpc steps.
    minor={'dtick': 6, 'gridcolor': 'grey', 'showgrid': True},
)
fig.show()
# Total duration per corpus and tonal pitch class, shown as bars coloured by corpus.
scatter_data = all_notes.groupby(['corpus_name', 'tpc']).duration_qb.sum().reset_index()
fig = px.bar(scatter_data, x='tpc', y='duration_qb', color='corpus_name',
                 labels=dict(
                     duration_qb='duration',
                     tpc='named pitch class',
                 ),
                 # category_orders must be keyed by the column it applies to; this
                 # frame has 'corpus_name' (there is no 'dataset' column here), so
                 # the previous key left the chronological order without effect.
                 category_orders=dict(corpus_name=chronological_corpus_names),
                 color_discrete_map=corpus_name_colors,
                 width=1000, height=500,
                )
fig.update_layout(**STD_LAYOUT)
fig.update_yaxes(gridcolor='lightgrey')
# One tick per tpc value (x_values), labelled with its note name (x_names);
# minor grid lines with a spacing of 6 tpc steps.
fig.update_xaxes(gridcolor='lightgrey', zerolinecolor='grey', tickmode='array',
                 tickvals=x_values, ticktext = x_names, dtick=1, ticks='outside', tickcolor='black',
                 minor=dict(dtick=6, gridcolor='grey', showgrid=True),
                )
fig.show()

As scatter plots#

# One facet per corpus showing the total duration of every tonal pitch class.
fig = px.scatter(scatter_data, x='tpc', y='duration_qb', color='corpus_name',
                 labels=dict(
                     duration_qb='duration',
                     tpc='named pitch class',
                 ),
                 # category_orders must be keyed by the column it applies to; the
                 # facets and colours use 'corpus_name' (there is no 'dataset'
                 # column in scatter_data), so the previous key had no effect and
                 # the facets were not arranged chronologically.
                 category_orders=dict(corpus_name=chronological_corpus_names),
                 color_discrete_map=corpus_name_colors,
                 facet_col='corpus_name', facet_col_wrap=3, facet_col_spacing=0.03,
                 width=1000, height=1000,
                )
fig.update_traces(mode='lines+markers')
# Facet titles come as "corpus_name=<value>"; keep only the value.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(**STD_LAYOUT, showlegend=False)
fig.update_xaxes(gridcolor='lightgrey', zerolinecolor='lightgrey', tickmode='array', tickvals= [-12, -6, 0, 6, 12, 18],
    ticktext = ["Dbb", "Gb", "C", "F#", "B#", "E##"], visible=True, )
# matches=None gives every facet its own y-axis range; tick labels are shown on all of them.
fig.update_yaxes(gridcolor='lightgrey', zeroline=False, matches=None, showticklabels=True)
fig.show()
# tpc values from -1 to 5 (inclusive) are counted as pitch classes without
# accidental; everything outside that range counts as "with accidental".
is_unaltered = bar_data.tpc.between(-1, 5)
no_accidental = bar_data.loc[is_unaltered, 'duration_qb'].sum()
with_accidental = bar_data.loc[~is_unaltered, 'duration_qb'].sum()
entire = no_accidental + with_accidental
f"Fraction of note duration without accidental of the entire durations: {no_accidental} / {entire} = {no_accidental / entire}"
'Fraction of note duration without accidental of the entire durations: 49164.03236540378 / 93841.0640804049 = 0.5239074476316579'

Notes and staves#

# Tabulate how the notes are distributed over the staves, using the project's
# value_count_df() helper (the displayed table has 'counts' and '%' columns).
print("Distribution of notes over staves:")
value_count_df(all_notes.staff)
Distribution of notes over staves:
counts %
staff
1 80174 0.536694
2 64335 0.430666
3 4876 0.03264