Notes#
import os
from collections import defaultdict, Counter
from git import Repo
import dimcat as dc
import ms3
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from utils import STD_LAYOUT, CADENCE_COLORS, CORPUS_COLOR_SCALE, chronological_corpus_order, color_background, get_corpus_display_name, get_repo_name, print_heading, resolve_dir, value_count_df
CORPUS_PATH = os.path.abspath(os.path.join('..', '..'))
ANNOTATED_ONLY = os.getenv("ANNOTATED_ONLY", "True").lower() in ('true', '1', 't')
print_heading("Notebook settings")
print(f"CORPUS_PATH: {CORPUS_PATH!r}")
print(f"ANNOTATED_ONLY: {ANNOTATED_ONLY}")
CORPUS_PATH = resolve_dir(CORPUS_PATH)
Notebook settings
-----------------
CORPUS_PATH: '/home/runner/work/workflow_deployment/corelli'
ANNOTATED_ONLY: False
repo = Repo(CORPUS_PATH)
print_heading("Data and software versions")
print(f"Data repo '{get_repo_name(repo)}' @ {repo.commit().hexsha[:7]}")
print(f"dimcat version {dc.__version__}")
print(f"ms3 version {ms3.__version__}")
Data and software versions
--------------------------
Data repo 'corelli' @ c3a2358
dimcat version 0.3.0
ms3 version 2.5.2
dataset = dc.Dataset()
dataset.load(directory=CORPUS_PATH, parse_tsv=False)
[default|all]
All corpora
-----------
View: This view is called 'default'. It
- excludes pieces that are not contained in the metadata,
- filters out file extensions requiring conversion (such as .xml), and
- excludes review files and folders.
| corpus | has metadata | active view | scores detected | measures detected | measures parsed | notes detected | notes parsed | expanded detected | expanded parsed | chords detected | chords parsed |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| corelli | yes | default | 149 | 149 | 149 | 149 | 149 | 149 | 149 | 149 | 149 |
1043/3427 files are excluded from this view.
1043 files have been excluded based on their subdir.
N = 149 annotated pieces, 596 parsed dataframes.
Metadata#
all_metadata = dataset.data.metadata()
print(f"Concatenated 'metadata.tsv' files cover {len(all_metadata)} of the {dataset.data.count_pieces()} scores.")
all_metadata.reset_index(level=1).groupby(level=0).nth(0).iloc[:,:20]
Concatenated 'metadata.tsv' files cover 149 of the 149 scores.
| corpus | piece | TimeSig | KeySig | last_mc | last_mn | length_qb | last_mc_unfolded | last_mn_unfolded | length_qb_unfolded | volta_mcs | all_notes_qb | n_onsets | n_onset_positions | guitar_chord_count | form_label_count | label_count | annotated_key | harmony_version | annotators | reviewers |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| corelli | op01n01a | {1: '4/4'} | {1: -1} | 14 | 14 | 56.0 | 14 | 14 | 56.0 |  | 224.0 | 280 | 110 | 0 | 0 | 64 | F | 2.3.0 | Lars Opfermann, Ya-Chuan Wu (2.1.1), Hanné Bec... | HB, JH |
Compute chronological order
chronological_order = chronological_corpus_order(all_metadata)
corpus_colors = dict(zip(chronological_order, CORPUS_COLOR_SCALE))
chronological_order
['corelli']
all_notes = dataset.data.get_all_parsed('notes', force=True, flat=True)
print(f"{len(all_notes.index)} notes over {len(all_notes.groupby(level=[0,1]))} files.")
all_notes.head()
WARNING ms3.Parse.corelli -- /home/runner/.local/lib/python3.10/site-packages/ms3/corpus.py (line 1255) check_number_of_unparsed_scores():
You have set force=True, which forces me to parse 149 scores iteratively. Next time, call _.parse() on me, so we can speed this up!
70322 notes over 149 files.
| corpus | piece | i | mc | mn | quarterbeats | quarterbeats_all_endings | duration_qb | mc_onset | mn_onset | timesig | staff | voice | duration | nominal_duration | scalar | tied | tpc | midi | name | octave | chord_id |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| corelli | op01n01a | 0 | 1 | 1 | 0 | 0 | 1.0 | 0 | 0 | 4/4 | 3 | 1 | 1/4 | 1/4 | 1 | <NA> | -1 | 53 | F3 | 3 | 8 |
|  |  | 1 | 1 | 1 | 0 | 0 | 1.0 | 0 | 0 | 4/4 | 4 | 1 | 1/4 | 1/4 | 1 | <NA> | -1 | 53 | F3 | 3 | 14 |
|  |  | 2 | 1 | 1 | 0 | 0 | 1.0 | 0 | 0 | 4/4 | 2 | 1 | 1/4 | 1/4 | 1 | <NA> | 3 | 81 | A5 | 5 | 4 |
|  |  | 3 | 1 | 1 | 0 | 0 | 1.0 | 0 | 0 | 4/4 | 1 | 1 | 1/4 | 1/4 | 1 | <NA> | 0 | 84 | C6 | 6 | 0 |
|  |  | 4 | 1 | 1 | 1 | 1 | 1.0 | 1/4 | 1/4 | 4/4 | 3 | 1 | 1/4 | 1/4 | 1 | <NA> | 1 | 55 | G3 | 3 | 9 |
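For orientation in this table: midi is the chromatic pitch number (60 = C4), while tpc encodes the tonal pitch class as a position on the line of fifths centered on C (0 = C, 1 = G, -1 = F, and so on), which is why the first row's tpc of -1 together with midi 53 spells the F3 shown in the name column. A minimal sketch of that correspondence, using ms3's fifths-to-name converter (variable names chosen for illustration; the simple octave formula only holds for unaltered pitches):

midi_val, tpc_val = 53, -1
octave = midi_val // 12 - 1      # 53 // 12 - 1 = 3
step = ms3.fifths2name(tpc_val)  # fifths -1 -> 'F'
print(f"{step}{octave}")         # F3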
def weight_notes(nl, group_col='midi', precise=True):
    """Sum the durations of all notes per group and convert them into integer weights."""
    summed_durations = nl.groupby(group_col).duration_qb.sum()
    shortest_duration = summed_durations[summed_durations > 0].min()
    summed_durations /= shortest_duration  # normalize such that the shortest duration results in 1 occurrence
    if not precise:
        # This simple trick reduces compute time but also precision:
        # the rationale is to have the smallest value be slightly larger than 0.5, because
        # if it were exactly 0.5 it would be rounded down by repeat_notes_according_to_weights()
        summed_durations /= 1.9999999
    return repeat_notes_according_to_weights(summed_durations)

def repeat_notes_according_to_weights(weights):
    """Repeat each index value round(weight) times and return the result as a Series."""
    try:
        counts = weights.round().astype(int)
    except Exception:
        # weights that cannot be rounded to integers (e.g. all NaN) yield an empty Series
        return pd.Series(dtype=int)
    counts_reflecting_weights = []
    for pitch, count in counts.items():
        counts_reflecting_weights.extend([pitch] * count)
    return pd.Series(counts_reflecting_weights)
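To make the weighting concrete, here is a minimal sketch on a made-up note list (the toy values are hypothetical; only the column names duration_qb and midi match those used above):

toy = pd.DataFrame({
    'midi': [60, 60, 64, 67],
    'duration_qb': [1.0, 1.0, 0.5, 3.5],
})
# summed durations per pitch: {60: 2.0, 64: 0.5, 67: 3.5}; normalizing by the shortest (0.5)
# yields weights {60: 4, 64: 1, 67: 7}, so each pitch is repeated that many times
weight_notes(toy).value_counts()  # expected counts: 67 -> 7, 60 -> 4, 64 -> 1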
Ambitus#
corpus_names = {corp: get_corpus_display_name(corp) for corp in chronological_order}
chronological_corpus_names = list(corpus_names.values())
corpus_name_colors = {corpus_names[corp]: color for corp, color in corpus_colors.items()}
all_notes['corpus_name'] = all_notes.index.get_level_values(0).map(corpus_names)
grouped_notes = all_notes.groupby('corpus_name')
weighted_midi = pd.concat(
    [weight_notes(nl, 'midi', precise=False) for _, nl in grouped_notes],
    keys=grouped_notes.groups.keys(),
).reset_index(level=0)
weighted_midi.columns = ['dataset', 'midi']
weighted_midi
|  | dataset | midi |
| --- | --- | --- |
| 0 | Corelli Trio Sonatas | 36 |
| 1 | Corelli Trio Sonatas | 36 |
| 2 | Corelli Trio Sonatas | 36 |
| 3 | Corelli Trio Sonatas | 36 |
| 4 | Corelli Trio Sonatas | 36 |
| ... | ... | ... |
| 16144 | Corelli Trio Sonatas | 86 |
| 16145 | Corelli Trio Sonatas | 86 |
| 16146 | Corelli Trio Sonatas | 88 |
| 16147 | Corelli Trio Sonatas | 88 |
| 16148 | Corelli Trio Sonatas | 88 |

16149 rows × 2 columns
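The frame above is assembled by concatenating the per-corpus weight Series with keys=, which adds the corpus name as an outer index level that reset_index(level=0) then turns into the dataset column. A minimal sketch of this pandas pattern with made-up values:

parts = {'A': pd.Series([60, 60]), 'B': pd.Series([64])}
stacked = pd.concat(parts.values(), keys=parts.keys()).reset_index(level=0)
stacked.columns = ['dataset', 'midi']
stacked  # dataset: A, A, B; midi: 60, 60, 64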
yaxis = dict(
    tickmode='array',
    tickvals=[12, 24, 36, 48, 60, 72, 84, 96],
    ticktext=["C0", "C1", "C2", "C3", "C4", "C5", "C6", "C7"],
    gridcolor='lightgrey',
)
fig = px.violin(weighted_midi,
x='dataset',
y='midi',
color='dataset',
box=True,
labels=dict(
dataset='',
midi='distribution of pitches by duration'
),
category_orders=dict(dataset=chronological_corpus_names),
color_discrete_map=corpus_name_colors,
width=1000, height=600,
)
fig.update_traces(spanmode='hard') # do not extend beyond outliers
fig.update_layout(yaxis=yaxis,
**STD_LAYOUT,
showlegend=False)
fig.show()
Tonal Pitch Classes (TPC)#
weighted_tpc = pd.concat(
    [weight_notes(nl, 'tpc') for _, nl in grouped_notes],
    keys=grouped_notes.groups.keys(),
).reset_index(level=0)
weighted_tpc.columns = ['dataset', 'tpc']
weighted_tpc
|  | dataset | tpc |
| --- | --- | --- |
| 0 | Corelli Trio Sonatas | -6 |
| 1 | Corelli Trio Sonatas | -5 |
| 2 | Corelli Trio Sonatas | -5 |
| 3 | Corelli Trio Sonatas | -5 |
| 4 | Corelli Trio Sonatas | -5 |
| ... | ... | ... |
| 129164 | Corelli Trio Sonatas | 12 |
| 129165 | Corelli Trio Sonatas | 12 |
| 129166 | Corelli Trio Sonatas | 12 |
| 129167 | Corelli Trio Sonatas | 12 |
| 129168 | Corelli Trio Sonatas | 12 |

129169 rows × 2 columns
As violin plot#
yaxis = dict(
    tickmode='array',
    tickvals=[-12, -9, -6, -3, 0, 3, 6, 9, 12, 15, 18],
    ticktext=["Dbb", "Bbb", "Gb", "Eb", "C", "A", "F#", "D#", "B#", "G##", "E##"],
    gridcolor='lightgrey',
    zerolinecolor='lightgrey',
    zeroline=True,
)
fig = px.violin(weighted_tpc,
x='dataset',
y='tpc',
color='dataset',
box=True,
labels=dict(
dataset='',
tpc='distribution of tonal pitch classes by duration'
),
category_orders=dict(dataset=chronological_corpus_names),
color_discrete_map=corpus_name_colors,
width=1000,
height=600,
)
fig.update_traces(spanmode='hard') # do not extend beyond outliers
fig.update_layout(yaxis=yaxis,
**STD_LAYOUT,
showlegend=False)
fig.show()
As bar plots#
bar_data = all_notes.groupby('tpc').duration_qb.sum().reset_index()
x_values = list(range(bar_data.tpc.min(), bar_data.tpc.max()+1))
x_names = ms3.fifths2name(x_values)
fig = px.bar(bar_data, x='tpc', y='duration_qb',
labels=dict(tpc='Named pitch class',
duration_qb='Duration in quarter notes'
),
color_discrete_sequence=CORPUS_COLOR_SCALE,
width=1000, height=300,
)
fig.update_layout(**STD_LAYOUT)
fig.update_yaxes(gridcolor='lightgrey')
fig.update_xaxes(gridcolor='lightgrey', zerolinecolor='grey', tickmode='array',
tickvals=x_values, ticktext = x_names, dtick=1, ticks='outside', tickcolor='black',
minor=dict(dtick=6, gridcolor='grey', showgrid=True),
)
fig.show()
scatter_data = all_notes.groupby(['corpus_name', 'tpc']).duration_qb.sum().reset_index()
fig = px.bar(scatter_data, x='tpc', y='duration_qb', color='corpus_name',
labels=dict(
duration_qb='duration',
tpc='named pitch class',
),
category_orders=dict(corpus_name=chronological_corpus_names),
color_discrete_map=corpus_name_colors,
width=1000, height=500,
)
fig.update_layout(**STD_LAYOUT)
fig.update_yaxes(gridcolor='lightgrey')
fig.update_xaxes(gridcolor='lightgrey', zerolinecolor='grey', tickmode='array',
tickvals=x_values, ticktext = x_names, dtick=1, ticks='outside', tickcolor='black',
minor=dict(dtick=6, gridcolor='grey', showgrid=True),
)
fig.show()
As scatter plots#
fig = px.scatter(scatter_data, x='tpc', y='duration_qb', color='corpus_name',
labels=dict(
duration_qb='duration',
tpc='named pitch class',
),
category_orders=dict(corpus_name=chronological_corpus_names),
color_discrete_map=corpus_name_colors,
facet_col='corpus_name', facet_col_wrap=3, facet_col_spacing=0.03,
width=1000, height=1000,
)
fig.update_traces(mode='lines+markers')
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(**STD_LAYOUT, showlegend=False)
fig.update_xaxes(gridcolor='lightgrey', zerolinecolor='lightgrey', tickmode='array', tickvals= [-12, -6, 0, 6, 12, 18],
ticktext = ["Dbb", "Gb", "C", "F#", "B#", "E##"], visible=True, )
fig.update_yaxes(gridcolor='lightgrey', zeroline=False, matches=None, showticklabels=True)
fig.show()
no_accidental = bar_data[bar_data.tpc.between(-1, 5)].duration_qb.sum()  # fifths -1..5 are the naturals F C G D A E B
with_accidental = bar_data[~bar_data.tpc.between(-1, 5)].duration_qb.sum()
entire = no_accidental + with_accidental
f"Fraction of the total note duration carried by notes without accidental: {no_accidental} / {entire} = {no_accidental / entire}"
'Fraction of the total note duration carried by notes without accidental: 49521.083333333336 / 64585.083333333336 = 0.7667572878670385'
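As a quick check of the tpc range used here, ms3's fifths-to-name conversion (already used for the tick labels above) spells fifths -1 through 5 as exactly the seven naturals:

ms3.fifths2name(list(range(-1, 6)))  # ['F', 'C', 'G', 'D', 'A', 'E', 'B']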
Notes and staves#
print("Distribution of notes over staves:")
value_count_df(all_notes.staff)
Distribution of notes over staves:
| staff | counts | % |
| --- | --- | --- |
| 1 | 20815 | 0.295996 |
| 2 | 17593 | 0.250178 |
| 3 | 16765 | 0.238403 |
| 4 | 15149 | 0.215423 |