Overview#

This notebook gives a general overview of the features included in the dataset.

Hide imports
import os
from collections import defaultdict, Counter
from fractions import Fraction

from git import Repo
import dimcat as dc
import ms3
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from utils import CADENCE_COLORS, CORPUS_COLOR_SCALE, STD_LAYOUT, TYPE_COLORS, color_background, corpus_mean_composition_years, value_count_df, get_corpus_display_name, get_repo_name, print_heading, resolve_dir
Hide source
# Notebook configuration. ANNOTATED_ONLY is read from the environment variable
# of the same name (case-insensitive 'true', '1' or 't' count as True) and
# defaults to True when the variable is unset.
ANNOTATED_ONLY = os.environ.get("ANNOTATED_ONLY", "True").lower() in ('true', '1', 't')
CORPUS_PATH = os.path.abspath(os.path.join('..', '..'))
print_heading("Notebook settings")
print("CORPUS_PATH: " + repr(CORPUS_PATH))
print("ANNOTATED_ONLY: " + str(ANNOTATED_ONLY))
# resolve_dir() is applied only after printing, so the line printed above
# shows the absolute path as computed here, before resolution.
CORPUS_PATH = resolve_dir(CORPUS_PATH)
Notebook settings
-----------------

CORPUS_PATH: '/home/runner/work/workflow_deployment/mozart_piano_sonatas'
ANNOTATED_ONLY: False
Hide source
# Record which revision of the data repository and which library versions
# were used to produce the outputs of this notebook.
repo = Repo(CORPUS_PATH)
print_heading("Data and software versions")
print("Data repo '{}' @ {}".format(get_repo_name(repo), repo.commit().hexsha[:7]))
print("dimcat version {}".format(dc.__version__))
print("ms3 version {}".format(ms3.__version__))
Data and software versions
--------------------------

Data repo 'mozart_piano_sonatas' @ 4b9724f
dimcat version 0.3.0
ms3 version 2.5.2
# Create a dimcat Dataset and load the corpus located at CORPUS_PATH.
# NOTE(review): parse_tsv=False presumably postpones parsing of the TSV
# files until a facet is requested — confirm against the dimcat docs.
dataset = dc.Dataset()
dataset.load(directory=CORPUS_PATH, parse_tsv=False)
[default|all]
All corpora
-----------
View: This view is called 'default'. It 
	- excludes pieces that are not contained in the metadata,
	- filters out file extensions requiring conversion (such as .xml), and
	- excludes review files and folders.

                          has   active   scores measures           notes        expanded          chords       
                     metadata     view detected detected parsed detected parsed detected parsed detected parsed
corpus                                                                                                         
mozart_piano_sonatas      yes  default       54       54     54       54     54       54     54       54     54

378/1250 files are excluded from this view.
4/58 pieces are excluded from this view.

378 files have been excluded based on their subdir.
N = 54 annotated pieces, 216 parsed dataframes.
Hide data loading
# Metadata for all pieces in the loaded dataset; later cells rely on it
# being non-empty.
all_metadata = dataset.data.metadata()
assert len(all_metadata) > 0, "No pieces selected for analysis."
print(f"Metadata covers {len(all_metadata)} of the {dataset.data.count_pieces()} scores.")

# Facets used throughout the notebook.
all_notes = dataset.get_facet('notes')
all_measures = dataset.get_facet('measures')

# The index of the mean composition years defines the corpus order that the
# colours and display names below are keyed on.
mean_composition_years = corpus_mean_composition_years(all_metadata)
chronological_order = mean_composition_years.index.tolist()

# corpus -> colour, corpus -> display name, display name -> colour.
corpus_colors = dict(zip(chronological_order, CORPUS_COLOR_SCALE))
corpus_names = dict(
    (corpus, get_corpus_display_name(corpus)) for corpus in chronological_order
)
chronological_corpus_names = list(corpus_names.values())
corpus_name_colors = dict(
    (corpus_names[corpus], colour) for corpus, colour in corpus_colors.items()
)
Metadata covers 54 of the 54 scores.

Composition dates#

This section relies on the dataset’s metadata.

# Coerce the composition-year columns to numbers; entries that cannot be
# parsed become NaN and are therefore ignored by min()/max().
valid_composed_start = pd.to_numeric(all_metadata['composed_start'], errors='coerce')
valid_composed_end = pd.to_numeric(all_metadata['composed_end'], errors='coerce')
# Print the earliest start and latest end year, each followed by the index
# label of the piece it belongs to.
print(
    "Composition dates range from "
    f"{int(valid_composed_start.min())} {valid_composed_start.idxmin()} to "
    f"{int(valid_composed_end.max())} {valid_composed_end.idxmax()}."
)
Composition dates range from 1775 ('mozart_piano_sonatas', 'K279-1') to 1789 ('mozart_piano_sonatas', 'K570-1').

Mean composition years per corpus#

Hide source
# Per-piece summary table: the metadata plus total length and note count.
summary = all_metadata.copy()
# Bracket assignment works whether or not the metadata already contains a
# `length_qb` column; attribute assignment would not create a missing column.
# The summed measure durations are multiplied by 4 to express them in
# quarter beats.
summary['length_qb'] = all_measures.groupby(level=[0, 1]).act_dur.sum() * 4.0
summary = pd.concat(
    [
        summary,
        all_notes.groupby(level=[0, 1]).size().rename('notes'),
    ],
    axis=1,
)

# One bar per corpus: x = mean composition year, y = number of pieces.
bar_data = pd.concat(
    [
        mean_composition_years.rename('year'),
        summary.groupby(level='corpus').size().rename('pieces'),
    ],
    axis=1,
).reset_index()
fig = px.bar(
    bar_data,
    x='year',
    y='pieces',
    color='corpus',
    color_discrete_map=corpus_colors,
)
# Bar width is given in x-axis units; set once (the original cell repeated
# this call a second time at the end).
fig.update_traces(width=5)
fig.update_layout(**STD_LAYOUT)
fig.update_yaxes(gridcolor='lightgrey')
# Display explicitly, like the histogram cell, instead of relying on the
# value of the last expression.
fig.show()

Composition years histogram#

Hide source
# Histogram of the pieces' `composed_end` years in 10-year bins, coloured by
# corpus display name.
hist_data = summary.reset_index()
hist_data['corpus'] = hist_data['corpus'].map(corpus_names)
fig = px.histogram(
    hist_data,
    x='composed_end',
    color='corpus',
    labels={'composed_end': 'decade', 'count': 'pieces'},
    color_discrete_map=corpus_name_colors,
)
fig.update_traces(xbins={'size': 10})
fig.update_layout(**STD_LAYOUT)
fig.update_yaxes(gridcolor='lightgrey')
fig.show()

Dimensions#

Overview#

Hide source
# Aggregate the per-piece summary by corpus (first index level).
corpus_metadata = summary.groupby(level=0)
n_pieces = corpus_metadata.size().rename('pieces')
absolute_numbers = dict(
    measures=corpus_metadata.last_mn.sum(),
    length=corpus_metadata.length_qb.sum(),
    notes=corpus_metadata.notes.sum(),
    labels=corpus_metadata.label_count.sum(),
)
absolute = pd.DataFrame.from_dict(absolute_numbers)
absolute = pd.concat([n_pieces, absolute], axis=1)

# Append a row labelled 'sum' holding the column totals over all corpora.
sum_row = pd.DataFrame(absolute.sum(), columns=['sum']).T
absolute = pd.concat([absolute, sum_row])

# Per-piece values. `n_pieces` has no 'sum' entry, so the 'sum' row comes out
# as NaN here.
relative = absolute.div(n_pieces, axis=0)

# Per-measure values for every corpus. The original sliced `iloc[:1, 2:]`,
# which kept only the first corpus; `[:-1]` covers all corpora while still
# leaving out the trailing 'sum' row. Columns from position 2 onwards are
# length, notes and labels.
per_measure = absolute.iloc[:-1, 2:].div(absolute.measures, axis=0)

complete_summary = pd.concat(
    [absolute, relative, per_measure],
    axis=1,
    keys=['absolute', 'per piece', 'per measure'],
)
complete_summary = complete_summary.apply(pd.to_numeric).round(2)
# Show display names instead of corpus identifiers; keep the 'sum' label.
complete_summary.index = complete_summary.index.map(dict(corpus_names, sum='sum'))
complete_summary
absolute per piece per measure
pieces measures length notes labels pieces measures length notes labels length notes labels
Mozart Piano Sonatas 54 7488 22408.25 104571 15272 1.0 138.67 414.97 1936.5 282.81 2.99 13.97 2.04
sum 54 7488 22408.25 104571 15272 NaN NaN NaN NaN NaN NaN NaN NaN

Measures#

# Report the number of measure rows and the number of groups formed by the
# first two index levels, then preview the table.
measure_groups = all_measures.groupby(level=[0, 1])
print(f"{all_measures.shape[0]} measures over {len(measure_groups)} files.")
all_measures.head()
7564 measures over 54 files.
mc mn quarterbeats duration_qb keysig timesig act_dur mc_offset numbering_offset dont_count barline breaks repeats next markers jump_bwd jump_fwd play_until quarterbeats_all_endings volta
corpus fname interval
mozart_piano_sonatas K279-1 [0.0, 4.0) 1 1 0 4.0 0 4/4 1 0 <NA> <NA> <NA> <NA> firstMeasure (2,) NaN NaN NaN NaN NaN <NA>
[4.0, 8.0) 2 2 4 4.0 0 4/4 1 0 <NA> <NA> <NA> line <NA> (3,) NaN NaN NaN NaN NaN <NA>
[8.0, 12.0) 3 3 8 4.0 0 4/4 1 0 <NA> <NA> <NA> <NA> <NA> (4,) NaN NaN NaN NaN NaN <NA>
[12.0, 16.0) 4 4 12 4.0 0 4/4 1 0 <NA> <NA> <NA> <NA> <NA> (5,) NaN NaN NaN NaN NaN <NA>
[16.0, 20.0) 5 5 16 4.0 0 4/4 1 0 <NA> <NA> <NA> line <NA> (6,) NaN NaN NaN NaN NaN <NA>
# Count how many measure rows carry each time signature; missing values are
# counted as their own category.
print("Distribution of time signatures per XML measure (MC):")
all_measures['timesig'].value_counts(dropna=False)
Distribution of time signatures per XML measure (MC):
3/4    1990
2/4    1691
4/4    1396
2/2    1065
6/8     847
3/8     575
Name: timesig, dtype: int64

Harmony labels#

This section considers all harmony symbols together, independent of the local key (the mode of which changes their semantics).

# Load the expanded harmony labels. If loading fails, fall back to an empty
# table so the notebook keeps running, but report the reason so that a real
# loading error is not mistaken for a dataset without annotations.
try:
    all_annotations = dataset.get_facet('expanded')
except Exception as e:
    print(f"Could not load the 'expanded' facet ({type(e).__name__}: {e}).")
    all_annotations = pd.DataFrame()
n_annotations = len(all_annotations.index)
includes_annotations = n_annotations > 0
if includes_annotations:
    display(all_annotations.head())
    print(f"Concatenated annotation tables contains {all_annotations.shape[0]} rows.")
    # Rows without a root are labels that are not chords.
    no_chord = all_annotations.root.isna()
    n_no_chord = no_chord.sum()
    if n_no_chord > 0:
        print(f"{n_no_chord} of them are not chords. Their values are: {all_annotations.label[no_chord].value_counts(dropna=False).to_dict()}")
    # Copy so that adding a column below does not touch `all_annotations`.
    all_chords = all_annotations[~no_chord].copy()
    print(f"Dataset contains {all_chords.shape[0]} tokens and {len(all_chords.chord.unique())} types over {len(all_chords.groupby(level=[0,1]))} documents.")
    # Add the corpus display name as a column to both tables.
    all_annotations['corpus_name'] = all_annotations.index.get_level_values(0).map(get_corpus_display_name)
    all_chords['corpus_name'] = all_chords.index.get_level_values(0).map(get_corpus_display_name)
else:
    print("Dataset contains no annotations.")
mc mn quarterbeats quarterbeats_all_endings duration_qb mc_onset mn_onset timesig staff voice ... chord_type globalkey_is_minor localkey_is_minor chord_tones added_tones root bass_note special alt_label volta
corpus fname interval
mozart_piano_sonatas K279-1 [0.0, 4.0) 1 1 0 0 4.0 0 0 4/4 2 1 ... M False False (0, 4, 1) () 0 0 <NA> <NA> <NA>
[4.0, 6.0) 2 2 4 4 2.0 0 0 4/4 2 1 ... m False False (-1, 3, 2) () 2 -1 <NA> <NA> <NA>
[6.0, 6.5) 2 2 6 6 0.5 1/2 1/2 4/4 2 1 ... Mm7 False False (-1, 1, 5, 2) () 1 -1 <NA> <NA> <NA>
[6.5, 7.0) 2 2 13/2 13/2 0.5 5/8 5/8 4/4 2 1 ... M False False (4, 1, 0) () 0 4 <NA> <NA> <NA>
[7.0, 7.5) 2 2 7 7 0.5 3/4 3/4 4/4 2 1 ... m False False (-1, 3, 2) () 2 -1 <NA> <NA> <NA>

5 rows × 32 columns

Concatenated annotation tables contains 15236 rows.
241 of them are not chords. Their values are: {'{': 224, '|HC': 15, '|PAC': 1, '|DC': 1}
Dataset contains 14995 tokens and 466 types over 54 documents.