Notes#
Show imports
import os
from collections import Counter, defaultdict

import dimcat as dc
import ms3
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from git import Repo

# NOTE: the original line imported get_repo_name and resolve_dir twice;
# the duplicates have been removed and the names sorted.
from utils import (
    CADENCE_COLORS,
    CORPUS_COLOR_SCALE,
    STD_LAYOUT,
    chronological_corpus_order,
    color_background,
    get_corpus_display_name,
    get_repo_name,
    print_heading,
    resolve_dir,
    value_count_df,
)
Show source
# Notebook settings: the corpus root sits two directories above this notebook;
# ANNOTATED_ONLY is toggled via the environment (any of 'true'/'1'/'t').
ANNOTATED_ONLY = os.getenv("ANNOTATED_ONLY", "True").lower() in {'true', '1', 't'}
CORPUS_PATH = os.path.abspath(os.path.join('..', '..'))
print_heading("Notebook settings")
print(f"CORPUS_PATH: {CORPUS_PATH!r}")
print(f"ANNOTATED_ONLY: {ANNOTATED_ONLY}")
CORPUS_PATH = resolve_dir(CORPUS_PATH)
Notebook settings
-----------------
CORPUS_PATH: '/home/runner/work/workflow_deployment/ABC'
ANNOTATED_ONLY: False
Show source
# Report provenance so the run is reproducible: the data repository's name
# and current commit hash (short form), plus the versions of the two main
# analysis libraries (dimcat, ms3).
repo = Repo(CORPUS_PATH)
print_heading("Data and software versions")
print(f"Data repo '{get_repo_name(repo)}' @ {repo.commit().hexsha[:7]}")
print(f"dimcat version {dc.__version__}")
print(f"ms3 version {ms3.__version__}")
Data and software versions
--------------------------
Data repo 'ABC' @ 4eb646a
dimcat version 0.3.0
ms3 version 2.5.2
# Register the corpus with a dimcat Dataset. parse_tsv=False defers TSV
# parsing; scores are parsed later on demand (see the force=True call below).
dataset = dc.Dataset()
dataset.load(directory=CORPUS_PATH, parse_tsv=False)
[default|all]
All corpora
-----------
View: This view is called 'default'. It
- excludes pieces that are not contained in the metadata,
- filters out file extensions requiring conversion (such as .xml), and
- excludes review files and folders.
has active scores measures notes expanded chords
metadata view detected detected parsed detected parsed detected parsed detected parsed
corpus
ABC yes default 70 70 70 70 70 70 70 70 70
490/1610 files are excluded from this view.
490 files have been excluded based on their subdir.
N = 70 annotated pieces, 280 parsed dataframes.
Metadata#
# Concatenate all metadata.tsv files into one frame, report coverage, and
# display the first piece per corpus (first 20 columns only).
all_metadata = dataset.data.metadata()
print(f"Concatenated 'metadata.tsv' files cover {len(all_metadata)} of the {dataset.data.count_pieces()} scores.")
all_metadata.reset_index(level=1).groupby(level=0).nth(0).iloc[:,:20]
Concatenated 'metadata.tsv' files cover 70 of the 70 scores.
piece | TimeSig | KeySig | last_mc | last_mn | length_qb | last_mc_unfolded | last_mn_unfolded | length_qb_unfolded | volta_mcs | all_notes_qb | n_onsets | n_onset_positions | guitar_chord_count | form_label_count | label_count | annotated_key | harmony_version | annotators | reviewers | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
corpus | ||||||||||||||||||||
ABC | n01op18-1_01 | {1: '3/4'} | {1: -1} | 313 | 313 | 939.0 | 427 | 427 | 1281.0 | 3132.75 | 4589 | 1950 | 0 | 0 | 405 | F | 1.0.0 | Markus Neuwirth | NaN |
Compute chronological order
# Order corpora chronologically and assign each one a color from the scale.
chronological_order = chronological_corpus_order(all_metadata)
corpus_colors = {corpus: color for corpus, color in zip(chronological_order, CORPUS_COLOR_SCALE)}
chronological_order
['ABC']
# Collect one parsed note table per piece into a single flat DataFrame
# (MultiIndex: corpus, piece, i). force=True parses any scores that are
# not parsed yet (hence the ms3 warning about iterative parsing).
all_notes = dataset.data.get_all_parsed('notes', force=True, flat=True)
print(f"{len(all_notes.index)} notes over {len(all_notes.groupby(level=[0,1]))} files.")
all_notes.head()
WARNING ms3.Parse.ABC -- /home/runner/.local/lib/python3.10/site-packages/ms3/corpus.py (line 1255) check_number_of_unparsed_scores():
You have set force=True, which forces me to parse 70 scores iteratively. Next time, call _.parse() on me, so we can speed this up!
241301 notes over 70 files.
mc | mn | quarterbeats | quarterbeats_all_endings | duration_qb | mc_onset | mn_onset | timesig | staff | voice | ... | nominal_duration | scalar | tied | tpc | midi | name | octave | chord_id | tremolo | volta | |||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
corpus | piece | i | |||||||||||||||||||||
ABC | n01op18-1_01 | 0 | 1 | 1 | 0 | 0 | 1.0 | 0 | 0 | 3/4 | 3 | 1 | ... | 1/4 | 1 | 1 | -1 | 53 | F3 | 3 | 12 | NaN | <NA> |
1 | 1 | 1 | 0 | 0 | 1.0 | 0 | 0 | 3/4 | 4 | 1 | ... | 1/4 | 1 | 1 | -1 | 53 | F3 | 3 | 18 | NaN | <NA> | ||
2 | 1 | 1 | 0 | 0 | 1.0 | 0 | 0 | 3/4 | 1 | 1 | ... | 1/4 | 1 | 1 | -1 | 65 | F4 | 4 | 0 | NaN | <NA> | ||
3 | 1 | 1 | 0 | 0 | 1.0 | 0 | 0 | 3/4 | 2 | 1 | ... | 1/4 | 1 | 1 | -1 | 65 | F4 | 4 | 6 | NaN | <NA> | ||
4 | 1 | 1 | 1 | 1 | 0.5 | 1/4 | 1/4 | 3/4 | 3 | 1 | ... | 1/8 | 1 | -1 | -1 | 53 | F3 | 3 | 13 | NaN | <NA> |
5 rows × 22 columns
def weight_notes(nl, group_col='midi', precise=True):
    """Turn a note list into duration-weighted pitch occurrences.

    Sums duration_qb per value of ``group_col``, normalizes so the smallest
    positive total corresponds to one occurrence, and expands the weights
    into a flat Series of repeated pitch values via
    repeat_notes_according_to_weights().
    """
    totals = nl.groupby(group_col).duration_qb.sum()
    positive = totals[totals > 0]
    # Normalize so the smallest positive summed duration maps to weight 1.
    normalized = totals / positive.min()
    if not precise:
        # Cheaper but coarser: dividing by just under 2 keeps the smallest
        # weight slightly above 0.5, so repeat_notes_according_to_weights()
        # still rounds it up to 1 while roughly halving the output size.
        normalized = normalized / 1.9999999
    return repeat_notes_according_to_weights(normalized)
def repeat_notes_according_to_weights(weights):
    """Expand a pitch -> weight Series into a flat Series of repeated pitches.

    Each weight is rounded to the nearest integer and the corresponding
    index value (the pitch) is emitted that many times. If the weights
    cannot be cast to int (e.g. they contain NaN), an empty int Series is
    returned instead.
    """
    try:
        counts = weights.round().astype(int)
    except Exception:
        # Non-finite weights cannot be converted; signal with an empty result.
        return pd.Series(dtype=int)
    repeated = [pitch for pitch, n in counts.items() for _ in range(n)]
    return pd.Series(repeated)
Ambitus#
# Map corpora to display names, attach them to the note table, and build a
# long-format frame of duration-weighted MIDI pitches per corpus.
corpus_names = {}
for corp in chronological_order:
    corpus_names[corp] = get_corpus_display_name(corp)
chronological_corpus_names = list(corpus_names.values())
corpus_name_colors = {}
for corp, color in corpus_colors.items():
    corpus_name_colors[corpus_names[corp]] = color
all_notes['corpus_name'] = all_notes.index.get_level_values(0).map(corpus_names)
grouped_notes = all_notes.groupby('corpus_name')
per_corpus = [weight_notes(nl, 'midi', precise=False) for _, nl in grouped_notes]
weighted_midi = pd.concat(per_corpus, keys=grouped_notes.groups.keys()).reset_index(level=0)
weighted_midi.columns = ['dataset', 'midi']
weighted_midi
dataset | midi | |
---|---|---|
0 | Beethoven String Quartets | 36 |
1 | Beethoven String Quartets | 36 |
2 | Beethoven String Quartets | 36 |
3 | Beethoven String Quartets | 36 |
4 | Beethoven String Quartets | 36 |
... | ... | ... |
164305 | Beethoven String Quartets | 97 |
164306 | Beethoven String Quartets | 97 |
164307 | Beethoven String Quartets | 97 |
164308 | Beethoven String Quartets | 98 |
164309 | Beethoven String Quartets | 99 |
164310 rows × 2 columns
# Y axis marking the octave Cs (MIDI 12 = C0 ... 96 = C7) on a light grid.
yaxis = {
    'tickmode': 'array',
    'tickvals': [12, 24, 36, 48, 60, 72, 84, 96],
    'ticktext': ["C0", "C1", "C2", "C3", "C4", "C5", "C6", "C7"],
    'gridcolor': 'lightgrey',
}
# Violin plot (with inner box) of the duration-weighted pitch distribution
# per corpus.
fig = px.violin(
    weighted_midi,
    x='dataset',
    y='midi',
    color='dataset',
    box=True,
    labels={'dataset': '', 'midi': 'distribution of pitches by duration'},
    category_orders={'dataset': chronological_corpus_names},
    color_discrete_map=corpus_name_colors,
    width=1000,
    height=600,
)
fig.update_traces(spanmode='hard')  # clip violins at the observed extremes
fig.update_layout(yaxis=yaxis, showlegend=False, **STD_LAYOUT)
fig.show()
Tonal Pitch Classes (TPC)#
# Same duration weighting as for MIDI pitches, but over tonal pitch classes
# (fifths) and at full precision.
per_corpus_tpc = [weight_notes(nl, 'tpc') for _, nl in grouped_notes]
weighted_tpc = pd.concat(per_corpus_tpc, keys=grouped_notes.groups.keys()).reset_index(level=0)
weighted_tpc.columns = ['dataset', 'tpc']
weighted_tpc
dataset | tpc | |
---|---|---|
0 | Beethoven String Quartets | -11 |
1 | Beethoven String Quartets | -10 |
2 | Beethoven String Quartets | -10 |
3 | Beethoven String Quartets | -9 |
4 | Beethoven String Quartets | -9 |
... | ... | ... |
164307 | Beethoven String Quartets | 13 |
164308 | Beethoven String Quartets | 13 |
164309 | Beethoven String Quartets | 13 |
164310 | Beethoven String Quartets | 13 |
164311 | Beethoven String Quartets | 14 |
164312 rows × 2 columns
As violin plot#
# Y axis labeling selected fifths positions with note names (0 = C).
yaxis = {
    'tickmode': 'array',
    'tickvals': [-12, -9, -6, -3, 0, 3, 6, 9, 12, 15, 18],
    'ticktext': ["Dbb", "Bbb", "Gb", "Eb", "C", "A", "F#", "D#", "B#", "G##", "E##"],
    'gridcolor': 'lightgrey',
    'zerolinecolor': 'lightgrey',
    'zeroline': True,
}
# Violin plot (with inner box) of the duration-weighted tonal-pitch-class
# distribution per corpus.
fig = px.violin(
    weighted_tpc,
    x='dataset',
    y='tpc',
    color='dataset',
    box=True,
    labels={'dataset': '', 'tpc': 'distribution of tonal pitch classes by duration'},
    category_orders={'dataset': chronological_corpus_names},
    color_discrete_map=corpus_name_colors,
    width=1000,
    height=600,
)
fig.update_traces(spanmode='hard')  # clip violins at the observed extremes
fig.update_layout(yaxis=yaxis, showlegend=False, **STD_LAYOUT)
fig.show()
As bar plots#
# Total duration per tonal pitch class, shown as a single bar chart with
# named pitch classes on the x axis.
bar_data = all_notes.groupby('tpc').duration_qb.sum().reset_index()
x_values = list(range(bar_data.tpc.min(), bar_data.tpc.max() + 1))
x_names = ms3.fifths2name(x_values)
fig = px.bar(
    bar_data,
    x='tpc',
    y='duration_qb',
    labels={'tpc': 'Named pitch class', 'duration_qb': 'Duration in quarter notes'},
    color_discrete_sequence=CORPUS_COLOR_SCALE,
    width=1000,
    height=300,
)
fig.update_layout(**STD_LAYOUT)
fig.update_yaxes(gridcolor='lightgrey')
fig.update_xaxes(
    gridcolor='lightgrey',
    zerolinecolor='grey',
    tickmode='array',
    tickvals=x_values,
    ticktext=x_names,
    dtick=1,
    ticks='outside',
    tickcolor='black',
    minor={'dtick': 6, 'gridcolor': 'grey', 'showgrid': True},
)
fig.show()
# Per-corpus duration totals per tonal pitch class, as colored bars.
scatter_data = all_notes.groupby(['corpus_name', 'tpc']).duration_qb.sum().reset_index()
fig = px.bar(
    scatter_data,
    x='tpc',
    y='duration_qb',
    color='corpus_name',
    labels={'duration_qb': 'duration', 'tpc': 'named pitch class'},
    # NOTE(review): key 'dataset' does not match the 'corpus_name' column,
    # so this ordering is likely a no-op — confirm the intended column name.
    category_orders={'dataset': chronological_corpus_names},
    color_discrete_map=corpus_name_colors,
    width=1000,
    height=500,
)
fig.update_layout(**STD_LAYOUT)
fig.update_yaxes(gridcolor='lightgrey')
fig.update_xaxes(
    gridcolor='lightgrey',
    zerolinecolor='grey',
    tickmode='array',
    tickvals=x_values,
    ticktext=x_names,
    dtick=1,
    ticks='outside',
    tickcolor='black',
    minor={'dtick': 6, 'gridcolor': 'grey', 'showgrid': True},
)
fig.show()
As scatter plots#
# One lines+markers panel per corpus over tonal pitch classes (3 per row).
fig = px.scatter(
    scatter_data,
    x='tpc',
    y='duration_qb',
    color='corpus_name',
    labels={'duration_qb': 'duration', 'tpc': 'named pitch class'},
    # NOTE(review): key 'dataset' does not match the 'corpus_name' column,
    # so this ordering is likely a no-op — confirm the intended column name.
    category_orders={'dataset': chronological_corpus_names},
    color_discrete_map=corpus_name_colors,
    facet_col='corpus_name',
    facet_col_wrap=3,
    facet_col_spacing=0.03,
    width=1000,
    height=1000,
)
fig.update_traces(mode='lines+markers')
# Strip the "corpus_name=" prefix from each facet title.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(**STD_LAYOUT, showlegend=False)
fig.update_xaxes(
    gridcolor='lightgrey',
    zerolinecolor='lightgrey',
    tickmode='array',
    tickvals=[-12, -6, 0, 6, 12, 18],
    ticktext=["Dbb", "Gb", "C", "F#", "B#", "E##"],
    visible=True,
)
fig.update_yaxes(gridcolor='lightgrey', zeroline=False, matches=None, showticklabels=True)
fig.show()
# Share of the total duration carried by the pitch classes without
# accidental (fifths -1..5, i.e. F C G D A E B per the axis labels above).
natural = bar_data.tpc.between(-1,5)
no_accidental = bar_data[natural].duration_qb.sum()
with_accidental = bar_data[~natural].duration_qb.sum()
entire = no_accidental + with_accidental
f"Fraction of note duration without accidental of the entire durations: {no_accidental} / {entire} = {no_accidental / entire}"
'Fraction of note duration without accidental of the entire durations: 110833.24285714285 / 164314.0208333333 = 0.6745209099932076'
Notes and staves#
# Tabulate how many notes fall on each staff (absolute counts and shares).
print("Distribution of notes over staves:")
value_count_df(all_notes.staff)
Distribution of notes over staves:
counts | % | |
---|---|---|
staff | ||
1 | 66680 | 0.276335 |
2 | 63009 | 0.261122 |
3 | 60376 | 0.25021 |
4 | 51236 | 0.212332 |