Notes#

Hide imports
import os

import dimcat as dc
import ms3
import pandas as pd
import plotly.express as px
from dimcat import filters, plotting

import utils

# Set pandas' display limits to 1000 rows and 500 columns for the tables shown below.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 500)
Hide source
# Absolute path of the "notes_stats" folder below utils.OUTPUT_FOLDER; used as the
# default target directory by the helper functions defined in this notebook.
RESULTS_PATH = os.path.abspath(os.path.join(utils.OUTPUT_FOLDER, "notes_stats"))
# exist_ok=True: re-running the notebook does not fail if the folder is already there.
os.makedirs(RESULTS_PATH, exist_ok=True)


def make_output_path(
    filename: str,
    extension=None,
    path=RESULTS_PATH,
) -> str:
    """Delegate to ``utils.make_output_path`` with this notebook's defaults.

    Args:
        filename: Passed through as ``filename``.
        extension: Passed through as ``extension`` (``None`` by default).
        path: Passed through as ``path``; defaults to ``RESULTS_PATH``.

    Returns:
        Whatever ``utils.make_output_path`` returns for these arguments.
    """
    forwarded = dict(filename=filename, extension=extension, path=path)
    return utils.make_output_path(**forwarded)


def save_figure_as(
    fig, filename, formats=("png", "pdf"), directory=RESULTS_PATH, **kwargs
):
    """Write ``fig`` through ``plotting.write_image``, once per requested format.

    Args:
        fig: The figure handed to ``plotting.write_image``.
        filename: The file name handed to ``plotting.write_image``.
        formats: Iterable of formats; one ``write_image`` call is made per entry,
            each with ``format=<entry>``. A single string (e.g. ``"png"``) is
            treated as one format. ``None`` results in a single call without an
            explicit ``format`` argument.
        directory: Target directory handed to ``plotting.write_image``;
            defaults to ``RESULTS_PATH``.
        **kwargs: Forwarded unchanged to every ``plotting.write_image`` call.
    """
    if formats is None:
        plotting.write_image(fig, filename, directory, **kwargs)
        return
    if isinstance(formats, str):
        # A bare string would otherwise be iterated character by character
        # ("png" -> "p", "n", "g"), requesting three bogus formats.
        formats = (formats,)
    for fmt in formats:
        plotting.write_image(fig, filename, directory, format=fmt, **kwargs)

Loading data

Hide source
# Load the dataset through the project helper, requesting corpus release "v2.5".
D = utils.get_dataset("pleyel_quartets", corpus_release="v2.5")
package = D.inputs.get_package()
# NOTE(review): `_package` is a private attribute; presumably `.custom` holds the
# datapackage's custom metadata (it is only used for its "git_tag" entry) — confirm
# against dimcat whether a public accessor exists.
package_info = package._package.custom
git_tag = package_info.get("git_tag")  # .get(): None if the entry is missing
utils.print_heading("Data and software versions")
# NOTE(review): the release "v2.5" is hard-coded in this message as well as in the
# get_dataset() call above; both need to be changed together.
print("Ignaz Pleyel – String Quartets version v2.5")
print(f"Datapackage '{package.package_name}' @ {git_tag}")
print(f"dimcat version {dc.__version__}\n")
D  # last expression of the cell: displays the dataset summary
Data and software versions
--------------------------

Ignaz Pleyel – String Quartets version v2.5
Datapackage 'pleyel_quartets' @ v2.5
dimcat version 3.4.0
Dataset
=======
{'inputs': {'basepath': None,
            'packages': {'pleyel_quartets': ["'pleyel_quartets.measures' (MuseScoreFacetName.MuseScoreMeasures)",
                                             "'pleyel_quartets.notes' (MuseScoreFacetName.MuseScoreNotes)",
                                             "'pleyel_quartets.expanded' (MuseScoreFacetName.MuseScoreHarmonies)",
                                             "'pleyel_quartets.chords' (MuseScoreFacetName.MuseScoreChords)",
                                             "'pleyel_quartets.metadata' (FeatureName.Metadata)"]}},
 'outputs': {'basepath': None, 'packages': {}},
 'pipeline': []}

Metadata#

# Run the dataset through HasHarmonyLabelsFilter, keeping the value True.
filtered_D = filters.HasHarmonyLabelsFilter(keep_values=[True]).process(D)

all_metadata = filtered_D.get_metadata()
# Preview: move index level 1 into a column, take the first row of every group of
# index level 0, and show only the first 20 columns.
all_metadata.reset_index(level=1).groupby(level=0).nth(0).iloc[:, :20]
piece TimeSig KeySig last_mc last_mn length_qb last_mc_unfolded last_mn_unfolded length_qb_unfolded volta_mcs all_notes_qb n_onsets n_onset_positions guitar_chord_count form_label_count label_count annotated_key harmony_version annotators reviewers
corpus
pleyel_quartets b307op2n1a {1: '4/4'} {1: 3} 199 197 788.0 398 394 1576.0 (87], [88) 2674.5 3621 1599 0 0 402 A 2.3.0 Adrian Nagel (2.1.0), Davor Krkljus (2.3.0) DK, AN
# Corpus ordering computed by the project helper from the metadata
# (presumably chronological, as the name says — see utils).
chronological_order = utils.chronological_corpus_order(all_metadata)
# Pair every corpus with a color; zip() stops at the shorter of the two sequences.
corpus_colors = dict(zip(chronological_order, utils.CORPUS_COLOR_SCALE))
notes_feature = filtered_D.get_feature("notes")
all_notes = notes_feature.df
# Row count of the notes table and number of distinct groups over index levels 0 and 1.
print(f"{len(all_notes.index)} notes over {len(all_notes.groupby(level=[0,1]))} files.")
all_notes.head()
13972 notes over 6 files.
mc mn quarterbeats quarterbeats_all_endings duration_qb duration mc_onset mn_onset timesig staff voice volta chord_id gracenote midi name nominal_duration octave scalar tied tremolo tpc_name tpc
corpus piece i
pleyel_quartets b307op2n1a 0 1 0 0 0 0.50 1/8 0 3/4 4/4 2 1 <NA> 2 <NA> 61 C#4 1/8 4 1 <NA> <NA> C# 7
1 1 0 0 0 0.50 1/8 0 3/4 4/4 1 1 <NA> 0 <NA> 69 A4 1/8 4 1 <NA> <NA> A 3
2 1 0 3/4 3/4 0.25 1/16 3/16 15/16 4/4 2 1 <NA> 3 <NA> 64 E4 1/16 4 1 <NA> <NA> E 4
3 1 0 3/4 3/4 0.25 1/16 3/16 15/16 4/4 1 1 <NA> 1 <NA> 71 B4 1/16 4 1 <NA> <NA> B 5
4 2 1 1 1 0.00 0 0 0 4/4 1 1 <NA> 4 acciaccatura 74 D5 1/8 5 1 <NA> <NA> D 2
def weight_notes(nl, group_col="midi", precise=True):
    """Turn summed durations per group into a duration-weighted list of group labels.

    Args:
        nl: Table with a ``duration_qb`` column that can be grouped by ``group_col``.
        group_col: What to group by before summing ``duration_qb``.
        precise: If False, the normalized weights are additionally divided by
            1.9999999, which yields fewer repetitions (cheaper, less precise).

    Returns:
        The result of ``repeat_notes_according_to_weights`` for the computed weights.
    """
    duration_per_group = nl.groupby(group_col)["duration_qb"].sum()
    positive_durations = duration_per_group[duration_per_group > 0]
    # Normalize such that the shortest positive duration results in 1 occurrence.
    weights = duration_per_group / positive_durations.min()
    if not precise:
        # This simple trick reduces compute time but also precision:
        # the rationale is to have the smallest value be slightly larger than 0.5,
        # because exactly 0.5 would be rounded down by
        # repeat_notes_according_to_weights().
        weights = weights / 1.9999999
    return repeat_notes_according_to_weights(weights)


def repeat_notes_according_to_weights(weights):
    """Repeat every index label of ``weights`` as often as its rounded weight.

    Args:
        weights: A pandas Series whose index holds the labels (e.g. pitches) and
            whose values are the weights. Values are rounded and cast to int.

    Returns:
        A Series with a default integer index in which each label occurs
        ``round(weight)`` times, in the order of ``weights``. Labels whose rounded
        weight is zero or negative do not occur. If the weights cannot be rounded
        and cast to integers (e.g. because they contain NaN), an empty integer
        Series is returned.
    """
    try:
        counts = weights.round().astype(int)
    except (TypeError, ValueError):
        # Only cast failures (e.g. non-finite weights) are mapped to "no notes";
        # unrelated errors are no longer swallowed.
        return pd.Series(dtype=int)
    # Negative counts contribute nothing, exactly like ``[label] * negative``.
    repeats = counts.clip(lower=0).to_numpy()
    return pd.Series(counts.index.repeat(repeats).to_numpy())

Ambitus#

# Map every corpus in chronological_order to its display name (via the project helper).
corpus_names = {
    corp: utils.get_corpus_display_name(corp) for corp in chronological_order
}
chronological_corpus_names = list(corpus_names.values())
# Same colors as corpus_colors, but keyed by display name instead of corpus identifier.
corpus_name_colors = {
    corpus_names[corp]: color for corp, color in corpus_colors.items()
}
# Adds a `corpus_name` column to `all_notes` in place, derived from index level 0.
all_notes["corpus_name"] = all_notes.index.get_level_values(0).map(corpus_names)
grouped_notes = all_notes.groupby("corpus_name")
# One duration-weighted series of MIDI pitches per corpus (precise=False: the cheaper,
# coarser weighting), concatenated with the group key as outer index level, which
# reset_index(level=0) then turns into a column.
weighted_midi = pd.concat(
    [weight_notes(nl, "midi", precise=False) for _, nl in grouped_notes],
    keys=grouped_notes.groups.keys(),
).reset_index(level=0)
weighted_midi.columns = ["dataset", "midi"]
weighted_midi
dataset midi
0 Pleyel Quartets 36
1 Pleyel Quartets 36
2 Pleyel Quartets 36
3 Pleyel Quartets 36
4 Pleyel Quartets 36
... ... ...
4105 Pleyel Quartets 88
4106 Pleyel Quartets 90
4107 Pleyel Quartets 90
4108 Pleyel Quartets 90
4109 Pleyel Quartets 93

4110 rows × 2 columns

# fig = px.violin(weighted_midi,
#                 x='dataset',
#                 y='midi',
#                 color='dataset',
#                 title="Corpus-wise distribution over registers (ambitus)",
#                 box=True,
#                 labels=dict(
#                     dataset='',
#                     midi='distribution of pitches by duration'
#                 ),
#                 category_orders=dict(dataset=chronological_corpus_names),
#                 color_discrete_map=corpus_name_colors,
#                 width=1000, height=600,
#                )
# fig.update_traces(spanmode='hard') # do not extend beyond outliers
# fig.update_layout(**utils.STD_LAYOUT,
#                  showlegend=False)
# fig.update_yaxes(
#     tickmode= 'array',
#     tickvals= [12, 24, 36, 48, 60, 72, 84, 96],
#     ticktext = ["C0", "C1", "C2", "C3", "C4", "C5", "C6", "C7"],
# )
# fig.update_xaxes(tickangle=45)
# save_figure_as(fig, "ambitus_corpuswise_violins")
# fig.show()

Tonal Pitch Classes (TPC)#

# One duration-weighted series of `tpc` values per corpus (default precise=True),
# concatenated with the group key as outer index level, which reset_index(level=0)
# then turns into a column.
weighted_tpc = pd.concat(
    [weight_notes(nl, "tpc") for _, nl in grouped_notes],
    keys=grouped_notes.groups.keys(),
).reset_index(level=0)
weighted_tpc.columns = ["dataset", "tpc"]
weighted_tpc
dataset tpc
0 Pleyel Quartets -5
1 Pleyel Quartets -5
2 Pleyel Quartets -5
3 Pleyel Quartets -5
4 Pleyel Quartets -5
... ... ...
10264 Pleyel Quartets 12
10265 Pleyel Quartets 12
10266 Pleyel Quartets 12
10267 Pleyel Quartets 12
10268 Pleyel Quartets 13

10269 rows × 2 columns

As violin plot#

# fig = px.violin(weighted_tpc,
#                 x='dataset',
#                 y='tpc',
#                 color='dataset',
#                 title="Corpus-wise distribution over line of fifths (tonal pitch classes)",
#                 box=True,
#                 labels=dict(
#                     dataset='',
#                     tpc='distribution of tonal pitch classes by duration'
#                 ),
#                 category_orders=dict(dataset=chronological_corpus_names),
#                 color_discrete_map=corpus_name_colors,
#                 width=1000,
#                 height=600,
#                )
# fig.update_traces(spanmode='hard') # do not extend beyond outliers
# fig.update_layout(**utils.STD_LAYOUT,
#                  showlegend=False)
# fig.update_yaxes(
#     tickmode= 'array',
#     tickvals= [-12, -9, -6, -3, 0, 3, 6, 9, 12, 15, 18],
#     ticktext = ["Dbb", "Bbb", "Gb", "Eb", "C", "A", "F#", "D#", "B#", "G##", "E##"],
#     zerolinecolor='grey',
#     zeroline=True
# )
# fig.update_xaxes(tickangle=45)
# save_figure_as(fig, "pitch_class_distributions_corpuswise_violins")
# fig.show()
(all_notes)  # cell output: display the notes table
mc mn quarterbeats quarterbeats_all_endings duration_qb duration mc_onset mn_onset timesig staff voice volta chord_id gracenote midi name nominal_duration octave scalar tied tremolo tpc_name tpc corpus_name
corpus piece i
pleyel_quartets b307op2n1a 0 1 0 0 0 0.50 1/8 0 3/4 4/4 2 1 <NA> 2 <NA> 61 C#4 1/8 4 1 <NA> <NA> C# 7 Pleyel Quartets
1 1 0 0 0 0.50 1/8 0 3/4 4/4 1 1 <NA> 0 <NA> 69 A4 1/8 4 1 <NA> <NA> A 3 Pleyel Quartets
2 1 0 3/4 3/4 0.25 1/16 3/16 15/16 4/4 2 1 <NA> 3 <NA> 64 E4 1/16 4 1 <NA> <NA> E 4 Pleyel Quartets
3 1 0 3/4 3/4 0.25 1/16 3/16 15/16 4/4 1 1 <NA> 1 <NA> 71 B4 1/16 4 1 <NA> <NA> B 5 Pleyel Quartets
4 2 1 1 1 0.00 0 0 0 4/4 1 1 <NA> 4 acciaccatura 74 D5 1/8 5 1 <NA> <NA> D 2 Pleyel Quartets
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
b309op2n3c 1048 74 74 219 219 2.00 1/2 0 0 3/4 3 1 <NA> 1024 <NA> 59 B3 1/2 3 1 <NA> <NA> B 5 Pleyel Quartets
1049 74 74 219 219 2.00 1/2 0 0 3/4 1 1 <NA> 1022 <NA> 67 G4 1/2 4 1 <NA> <NA> G 1 Pleyel Quartets
1050 74 74 219 219 2.00 1/2 0 0 3/4 2 1 <NA> 1023 <NA> 67 G4 1/2 4 1 <NA> <NA> G 1 Pleyel Quartets
1051 74 74 220 220 1.00 1/4 1/4 1/4 3/4 4 1 <NA> 1026 <NA> 50 D3 1/4 3 1 <NA> <NA> D 2 Pleyel Quartets
1052 74 74 221 221 1.00 1/4 1/2 1/2 3/4 4 1 <NA> 1027 <NA> 43 G2 1/4 2 1 <NA> <NA> G 1 Pleyel Quartets

13972 rows × 24 columns

# Pixel dimensions shared by the plotting call and the image export below.
width = 1400
height = 800

# Stack both weighted distributions into one long table: the pitch columns are renamed
# to a common "value" column, the concat keys end up in the "distribution" index level,
# and reset_index() turns both index levels into ordinary columns.
weighted_pitch_values = pd.concat(
    [
        weighted_midi.rename(columns={"midi": "value"}),
        weighted_tpc.rename(columns={"tpc": "value"}),
    ],
    keys=["MIDI pitch", "Tonal pitch class"],
    names=["distribution"],
).reset_index(level=[0, 1])

# One facet row per distribution, one violin per dataset.
# NOTE(review): the `labels` key "tpc" does not match any column used here
# (y_col is "value"), so that label presumably has no effect — confirm intent.
fig = plotting.make_violin_plot(
    weighted_pitch_values,
    x_col="dataset",
    y_col="value",
    color="dataset",
    facet_row="distribution",
    box=True,
    labels=dict(dataset="", tpc="distribution of tonal pitch classes by duration"),
    category_orders=dict(dataset=chronological_corpus_names),
    # color_discrete_map=corpus_name_colors,
    color_discrete_sequence=px.colors.qualitative.Dark24,
    traces_settings=dict(
        spanmode="hard",
        width=1,
        # scalemode='width'
    ),
    layout=dict(
        showlegend=False,
        margin=dict(
            t=0,
            b=0,
            l=0,
            r=0,
        ),
    ),
    x_axis=dict(
        # tickangle=45,
        tickfont_size=15
    ),
    # A single tick list serves both facet rows: the values -12..15 are labelled with
    # pitch-class names (Dbb..G##), the values 24..96 with octave names (C1..C7).
    y_axis=dict(
        tickmode="array",
        tickvals=[-12, -9, -6, -3, 0, 3, 6, 9, 12, 15, 24, 36, 48, 60, 72, 84, 96],
        ticktext=[
            "Dbb",
            "Bbb",
            "Gb",
            "Eb",
            "C",
            "A",
            "F#",
            "D#",
            "B#",
            "G##",
            "C1",
            "C2",
            "C3",
            "C4",
            "C5",
            "C6",
            "C7",
        ],
        zerolinecolor="grey",
        zeroline=True,
    ),
    width=width,
    height=height,
)
# Project helper; called here with an empty y-axis title.
utils.realign_subplot_axes(fig, y_axes=dict(title_text=""))
save_figure_as(fig, "notes_violin", width=width, height=height)
fig
# Box-plot counterpart of the violin figure: same long table, same facetting by
# "distribution", same tick layout; uses `width`/`height` defined earlier.
# NOTE(review): the `labels` key "tpc" does not match any column used here
# (y_col is "value"), so that label presumably has no effect — confirm intent.
fig = plotting.make_box_plot(
    weighted_pitch_values,
    x_col="dataset",
    y_col="value",
    color="dataset",
    facet_row="distribution",
    # box=True,
    labels=dict(dataset="", tpc="distribution of tonal pitch classes by duration"),
    category_orders=dict(dataset=chronological_corpus_names),
    # color_discrete_map=corpus_name_colors,
    color_discrete_sequence=px.colors.qualitative.Light24,
    # traces_settings=dict(spanmode='hard'),
    layout=dict(showlegend=False, margin=dict(t=0)),
    x_axis=dict(tickangle=45, tickfont_size=15),
    # A single tick list serves both facet rows: the values -12..15 are labelled with
    # pitch-class names (Dbb..G##), the values 24..96 with octave names (C1..C7).
    y_axis=dict(
        tickmode="array",
        tickvals=[-12, -9, -6, -3, 0, 3, 6, 9, 12, 15, 24, 36, 48, 60, 72, 84, 96],
        ticktext=[
            "Dbb",
            "Bbb",
            "Gb",
            "Eb",
            "C",
            "A",
            "F#",
            "D#",
            "B#",
            "G##",
            "C1",
            "C2",
            "C3",
            "C4",
            "C5",
            "C6",
            "C7",
        ],
        zerolinecolor="grey",
        zeroline=True,
    ),
    width=width,
    height=height,
)
# Project helper; called here with y_axes=True.
utils.realign_subplot_axes(fig, y_axes=True)
save_figure_as(fig, "notes_box", width=width, height=height)
fig

As bar plots#

# Summed duration_qb per tpc value over all notes.
bar_data = all_notes.groupby("tpc")["duration_qb"].sum().reset_index()
# Every integer between the lowest and highest occurring tpc gets its own tick,
# including values that do not occur in the data.
lowest_tpc = bar_data["tpc"].min()
highest_tpc = bar_data["tpc"].max()
x_values = list(range(lowest_tpc, highest_tpc + 1))
x_names = ms3.fifths2name(x_values)
fig = px.bar(
    bar_data,
    x="tpc",
    y="duration_qb",
    labels={
        "tpc": "Named pitch class",
        "duration_qb": "Duration in quarter notes",
    },
    color_discrete_sequence=utils.CORPUS_COLOR_SCALE,
    width=1000,
    height=300,
)
fig.update_layout(**utils.STD_LAYOUT)
# Label the integer tpc positions with the names computed above.
fig.update_xaxes(
    zerolinecolor="grey",
    tickmode="array",
    tickvals=x_values,
    ticktext=x_names,
    dtick=1,
    ticks="outside",
    tickcolor="black",
    minor={"dtick": 6, "gridcolor": "grey", "showgrid": True},
)
save_figure_as(fig, "pitch_class_distribution_absolute_bars")
fig.show()
# Summed duration_qb per (corpus_name, tpc) pair; also reused by the scatter plot cell.
scatter_data = all_notes.groupby(["corpus_name", "tpc"]).duration_qb.sum().reset_index()
fig = px.bar(
    scatter_data,
    x="tpc",
    y="duration_qb",
    color="corpus_name",
    labels=dict(
        duration_qb="duration",
        tpc="named pitch class",
    ),
    # category_orders is keyed by column name. This frame has no "dataset" column
    # (the corpus column is "corpus_name"), so the former key had no effect.
    category_orders=dict(corpus_name=chronological_corpus_names),
    color_discrete_map=corpus_name_colors,
    width=1000,
    height=500,
)
fig.update_layout(**utils.STD_LAYOUT)
# Label the integer tpc positions with the names computed for the previous figure.
fig.update_xaxes(
    zerolinecolor="grey",
    tickmode="array",
    tickvals=x_values,
    ticktext=x_names,
    dtick=1,
    ticks="outside",
    tickcolor="black",
    minor=dict(dtick=6, gridcolor="grey", showgrid=True),
)
save_figure_as(fig, "pitch_class_distribution_corpuswise_absolute_bars")
fig.show()

As scatter plots#

# One facet per corpus, showing summed duration over tpc.
fig = px.scatter(
    scatter_data,
    x="tpc",
    y="duration_qb",
    color="corpus_name",
    labels=dict(
        duration_qb="duration",
        tpc="named pitch class",
    ),
    # category_orders is keyed by column name. This frame has no "dataset" column
    # (the corpus column is "corpus_name"), so the former key had no effect on the
    # order of facets and colors.
    category_orders=dict(corpus_name=chronological_corpus_names),
    color_discrete_map=corpus_name_colors,
    facet_col="corpus_name",
    facet_col_wrap=3,
    facet_col_spacing=0.03,
    width=1000,
    height=1000,
)
fig.update_traces(mode="lines+markers")
# Facet titles come as "corpus_name=<value>"; keep only the part after the last "=".
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(**utils.STD_LAYOUT, showlegend=False)
fig.update_xaxes(
    zerolinecolor="grey",
    tickmode="array",
    tickvals=[-12, -6, 0, 6, 12, 18],
    ticktext=["Dbb", "Gb", "C", "F#", "B#", "E##"],
    visible=True,
)
# matches=None: every facet gets its own y-range, hence tick labels on each of them.
fig.update_yaxes(zeroline=False, matches=None, showticklabels=True)
save_figure_as(fig, "pitch_class_distribution_corpuswise_scatter")
fig.show()
# tpc values -1..5 (inclusive) are the ones counted as "without accidental" here.
natural_mask = bar_data.tpc.between(-1, 5)
no_accidental = bar_data.loc[natural_mask, "duration_qb"].sum()
with_accidental = bar_data.loc[~natural_mask, "duration_qb"].sum()
entire = no_accidental + with_accidental
# The string is the cell's output; it is displayed, not printed.
(
    f"Fraction of note duration without accidental of the entire durations: {no_accidental} / {entire} = "
    f"{no_accidental / entire}"
)
'Fraction of note duration without accidental of the entire durations: 7325.604166666667 / 10270.416666666668 = 0.713272343705627'

Notes and staves#

print("Distribution of notes over staves:")
# Project helper applied to the `staff` column; its result is the cell's output.
utils.value_count_df(all_notes.staff)
Distribution of notes over staves:
counts %
staff
1 4741 33.93
2 3630 25.98
3 3056 21.87
4 2545 18.22