Overview#
This notebook gives a general overview of the features included in the dataset.
Show imports
import os
from collections import defaultdict, Counter
from fractions import Fraction
from git import Repo
import dimcat as dc
import ms3
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from utils import CADENCE_COLORS, CORPUS_COLOR_SCALE, STD_LAYOUT, TYPE_COLORS, color_background, corpus_mean_composition_years, value_count_df, get_corpus_display_name, get_repo_name, print_heading, resolve_dir
Show source
# Notebook configuration.
# CORPUS_PATH defaults to the directory two levels above the working directory.
# ANNOTATED_ONLY is read from the environment variable of the same name and is
# True when that variable (default "True") is one of the accepted spellings.
_TRUTHY_STRINGS = ('true', '1', 't')
CORPUS_PATH = os.path.abspath(os.path.join('..', '..'))
annotated_only_setting = os.getenv("ANNOTATED_ONLY", "True")
ANNOTATED_ONLY = annotated_only_setting.lower() in _TRUTHY_STRINGS

print_heading("Notebook settings")
print(f"CORPUS_PATH: {CORPUS_PATH!r}")
print(f"ANNOTATED_ONLY: {ANNOTATED_ONLY}")

# The value printed above is the path *before* resolve_dir() is applied.
# NOTE(review): resolve_dir comes from the project-local utils module; its
# exact normalisation is not visible here.
CORPUS_PATH = resolve_dir(CORPUS_PATH)
Notebook settings
-----------------
CORPUS_PATH: '/home/runner/work/workflow_deployment/schubert_winterreise'
ANNOTATED_ONLY: False
Show source
# Report which revision of the data repository and which library versions
# produced the output of this notebook.
repo = Repo(CORPUS_PATH)
print_heading("Data and software versions")
repo_name = get_repo_name(repo)
short_sha = repo.commit().hexsha[:7]  # first seven characters of the commit hash
print(f"Data repo '{repo_name}' @ {short_sha}")
dimcat_version = dc.__version__
print(f"dimcat version {dimcat_version}")
ms3_version = ms3.__version__
print(f"ms3 version {ms3_version}")
Data and software versions
--------------------------
Data repo 'schubert_winterreise' @ 30e2245
dimcat version 0.3.0
ms3 version 2.5.2
# Create the dimcat dataset and point it at the corpus directory.
# NOTE(review): parse_tsv=False presumably defers parsing of the TSV files
# until a facet is requested -- confirm against the dimcat documentation.
# The load() call stays the last expression of the cell so that whatever it
# returns/prints is shown as the cell output.
dataset = dc.Dataset()
dataset.load(
    directory=CORPUS_PATH,
    parse_tsv=False,
)
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n01'. I'm picking 'labels/v0.1/n01.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n02'. I'm picking 'labels/v0.1/n02.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n03'. I'm picking 'labels/v0.1/n03.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n04'. I'm picking 'labels/v0.1/n04.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n05'. I'm picking 'labels/v0.1/n05.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n06'. I'm picking 'labels/v0.1/n06.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n07'. I'm picking 'labels/v0.1/n07.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n08'. I'm picking 'labels/v0.1/n08.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n09'. I'm picking 'labels/v0.1/n09.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n10'. I'm picking 'labels/v0.1/n10.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n11'. I'm picking 'labels/v0.1/n11.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n12'. I'm picking 'labels/v0.1/n12.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n13'. I'm picking 'labels/v0.1/n13.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n14'. I'm picking 'labels/v0.1/n14.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n15'. I'm picking 'labels/v0.1/n15.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n16'. I'm picking 'labels/v0.1/n16.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n17'. I'm picking 'labels/v0.1/n17.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n18'. I'm picking 'labels/v0.1/n18.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n19'. I'm picking 'labels/v0.1/n19.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n20'. I'm picking 'labels/v0.1/n20.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n21'. I'm picking 'labels/v0.1/n21.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n22'. I'm picking 'labels/v0.1/n22.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n23'. I'm picking 'labels/v0.1/n23.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
WARNING ms3.utils.functions -- /home/runner/.local/lib/python3.10/site-packages/ms3/utils/functions.py (line 6191) automatically_choose_from_disambiguated_files():
Unable to automatically choose from the 2 'labels' with piece 'n24'. I'm picking 'labels/v0.1/n24.labels.tsv' because its disambiguation string '/v0.1' is the lexicographically first among ['/v0.1', '/v0.2']
[default|all]
All corpora
-----------
View: This view is called 'default'. It
- excludes pieces that are not contained in the metadata,
- filters out file extensions requiring conversion (such as .xml), and
- excludes review files and folders.
has active scores measures notes labels expanded chords
metadata view detected detected parsed detected parsed detected parsed detected parsed detected parsed
corpus
schubert_winterreise yes default 24 24 24 24 24 48 24 24 24 24 24
168/696 files are excluded from this view.
168 files have been excluded based on their subdir.
N = 24 annotated pieces, 120 parsed dataframes.
Show data loading
# Metadata of all pieces in the dataset; the notebook cannot continue without it.
all_metadata = dataset.data.metadata()
assert len(all_metadata) > 0, "No pieces selected for analysis."
n_scores = dataset.data.count_pieces()
print(f"Metadata covers {len(all_metadata)} of the {n_scores} scores.")

# Concatenated note and measure tables for all pieces.
all_notes = dataset.get_facet('notes')
all_measures = dataset.get_facet('measures')

# One mean composition year per corpus; its index defines the corpus order
# used for colours and display names below.
# NOTE(review): the name suggests the index is sorted chronologically --
# confirm in utils.corpus_mean_composition_years.
mean_composition_years = corpus_mean_composition_years(all_metadata)
chronological_order = mean_composition_years.index.tolist()

# corpus -> colour, pairing corpora with the colour scale in order.
corpus_colors = {
    corpus: color
    for corpus, color in zip(chronological_order, CORPUS_COLOR_SCALE)
}
# corpus -> human-readable display name.
corpus_names = dict(
    zip(chronological_order, map(get_corpus_display_name, chronological_order))
)
chronological_corpus_names = [*corpus_names.values()]
# display name -> colour, for plots that show display names.
corpus_name_colors = {
    corpus_names[corpus]: corpus_colors[corpus] for corpus in corpus_colors
}
Metadata covers 24 of the 24 scores.
Composition dates#
This section relies on the dataset’s metadata.
# Convert the year columns to numbers; errors='coerce' turns values that
# cannot be parsed into NaN, which min()/max() skip.
valid_composed_start = pd.to_numeric(all_metadata['composed_start'], errors='coerce')
valid_composed_end = pd.to_numeric(all_metadata['composed_end'], errors='coerce')

# Earliest start / latest end together with the index label of the piece
# that holds the respective extreme.
earliest_year = int(valid_composed_start.min())
earliest_piece = valid_composed_start.idxmin()
latest_year = int(valid_composed_end.max())
latest_piece = valid_composed_end.idxmax()
print(f"Composition dates range from {earliest_year} {earliest_piece} "
      f"to {latest_year} {latest_piece}.")
Composition dates range from 1827 ('schubert_winterreise', 'n01') to 1828 ('schubert_winterreise', 'n01').
Mean composition years per corpus#
Show source
# Per-piece summary table: the metadata plus total length and number of notes.
summary = all_metadata.copy()
# act_dur is expressed in fractions of a whole note (e.g. 1/2 for a 2/4
# measure), so multiplying the per-piece sum by 4 yields quarter beats.
# Bracket assignment is used on purpose: attribute assignment
# (summary.length_qb = ...) would only set a Python attribute, not a column,
# if the metadata did not already contain a 'length_qb' column.
summary['length_qb'] = all_measures.groupby(level=[0, 1]).act_dur.sum() * 4.0
summary = pd.concat(
    [
        summary,
        all_notes.groupby(level=[0, 1]).size().rename('notes'),
    ],
    axis=1,
)

# One bar per corpus: x = mean composition year, y = number of pieces.
bar_data = pd.concat(
    [
        mean_composition_years.rename('year'),
        summary.groupby(level='corpus').size().rename('pieces'),
    ],
    axis=1,
).reset_index()
fig = px.bar(
    bar_data,
    x='year',
    y='pieces',
    color='corpus',
    color_discrete_map=corpus_colors,
)
fig.update_traces(width=5)
fig.update_layout(**STD_LAYOUT)
fig.update_yaxes(gridcolor='lightgrey')
# Display the figure explicitly (the original repeated update_traces(width=5)
# here and relied on the returned figure being rendered as cell output).
fig.show()
Composition years histogram#
Show source
# Histogram of composition end years in bins of ten years, coloured by corpus.
# The corpus identifiers are replaced by their display names so that the
# legend matches corpus_name_colors.
hist_data = summary.reset_index().assign(
    corpus=lambda df: df.corpus.map(corpus_names)
)
axis_labels = {'composed_end': 'decade', 'count': 'pieces'}
fig = px.histogram(
    hist_data,
    x='composed_end',
    color='corpus',
    labels=axis_labels,
    color_discrete_map=corpus_name_colors,
)
fig.update_traces(xbins={'size': 10})
fig.update_layout(**STD_LAYOUT)
fig.update_yaxes(gridcolor='lightgrey')
fig.show()
Dimensions#
Overview#
Show source
# Build a three-part overview table (absolute numbers, numbers per piece,
# numbers per measure) with one row per corpus plus a final 'sum' row.
corpus_metadata = summary.groupby(level=0)
n_pieces = corpus_metadata.size().rename('pieces')
absolute_numbers = {
    'measures': corpus_metadata.last_mn.sum(),
    'length': corpus_metadata.length_qb.sum(),
    'notes': corpus_metadata.notes.sum(),
    'labels': corpus_metadata.label_count.sum(),
}
absolute = pd.DataFrame.from_dict(absolute_numbers)
absolute = pd.concat([n_pieces, absolute], axis=1)

# Append the column totals as a last row labelled 'sum'.
sum_row = pd.DataFrame(absolute.sum(), columns=['sum']).T
absolute = pd.concat([absolute, sum_row])

# n_pieces has no 'sum' entry, so the 'sum' row of the per-piece part is NaN.
relative = absolute.div(n_pieces, axis=0)

# Per-measure values for every corpus row: all rows except the trailing 'sum'
# row, and only the columns after 'pieces' and 'measures'. The original
# sliced with [:1], which covers just the first corpus and leaves NaN for any
# further corpus; for a single corpus both slices are identical.
per_measure = absolute.iloc[:-1, 2:].div(absolute.measures, axis=0)

complete_summary = pd.concat(
    [absolute, relative, per_measure],
    axis=1,
    keys=['absolute', 'per piece', 'per measure'],
)
complete_summary = complete_summary.apply(pd.to_numeric).round(2)
# Show display names in the index; the 'sum' label is kept as is.
complete_summary.index = complete_summary.index.map(dict(corpus_names, sum='sum'))
complete_summary
absolute | per piece | per measure | |||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
pieces | measures | length | notes | labels | pieces | measures | length | notes | labels | length | notes | labels | |
Schubert Winterreise | 24 | 1417 | 4052.0 | 26614 | 3100 | 1.0 | 59.04 | 168.83 | 1108.92 | 129.17 | 2.86 | 18.78 | 2.19 |
sum | 24 | 1417 | 4052.0 | 26614 | 3100 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Measures#
# Number of rows in the measures table and number of distinct groups on the
# first two index levels (one group per file).
n_measures = len(all_measures.index)
n_files = len(all_measures.groupby(level=[0, 1]))
print(f"{n_measures} measures over {n_files} files.")
# Last expression of the cell: preview of the first rows.
all_measures.head()
1425 measures over 24 files.
mc | mn | quarterbeats | duration_qb | keysig | timesig | act_dur | mc_offset | numbering_offset | dont_count | barline | breaks | repeats | next | |||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
corpus | fname | interval | ||||||||||||||
schubert_winterreise | n01 | [0.0, 2.0) | 1 | 1 | 0 | 2.0 | -1 | 2/4 | 1/2 | 0 | <NA> | <NA> | <NA> | <NA> | firstMeasure | (2,) |
[2.0, 4.0) | 2 | 2 | 2 | 2.0 | -1 | 2/4 | 1/2 | 0 | <NA> | <NA> | <NA> | <NA> | <NA> | (3,) | ||
[4.0, 6.0) | 3 | 3 | 4 | 2.0 | -1 | 2/4 | 1/2 | 0 | <NA> | <NA> | <NA> | <NA> | <NA> | (4,) | ||
[6.0, 8.0) | 4 | 4 | 6 | 2.0 | -1 | 2/4 | 1/2 | 0 | <NA> | <NA> | <NA> | <NA> | <NA> | (5,) | ||
[8.0, 10.0) | 5 | 5 | 8 | 2.0 | -1 | 2/4 | 1/2 | 0 | <NA> | <NA> | <NA> | <NA> | <NA> | (6,) |
# Count how many rows of the measures table carry each time signature;
# dropna=False keeps a separate count for missing values, if there are any.
print("Distribution of time signatures per XML measure (MC):")
timesig_counts = all_measures['timesig'].value_counts(dropna=False)
timesig_counts
Distribution of time signatures per XML measure (MC):
2/4 502
3/4 371
6/8 244
4/4 160
2/2 56
12/8 49
3/8 43
Name: timesig, dtype: int64
Harmony labels#
This section covers all symbols, independent of the local key (whose mode changes their semantics).
# Load the expanded harmony labels. The notebook is meant to keep running for
# datasets without annotations, so a failure falls back to an empty table --
# but the reason is reported instead of being swallowed silently, so that a
# genuine loading error is not mistaken for "no annotations".
try:
    all_annotations = dataset.get_facet('expanded')
except Exception as exc:
    print(f"Could not load the 'expanded' facet: {exc!r}")
    all_annotations = pd.DataFrame()
n_annotations = len(all_annotations.index)
includes_annotations = n_annotations > 0
if includes_annotations:
    display(all_annotations.head())
    print(f"Concatenated annotation tables contains {all_annotations.shape[0]} rows.")
    # Rows without a root are labels that are not chords.
    no_chord = all_annotations.root.isna()
    n_no_chord = no_chord.sum()
    if n_no_chord > 0:
        non_chord_labels = all_annotations.label[no_chord].value_counts(dropna=False).to_dict()
        print(f"{n_no_chord} of them are not chords. Their values are: {non_chord_labels}")
    # Independent copy so that the column added below does not touch
    # all_annotations.
    all_chords = all_annotations[~no_chord].copy()
    n_tokens = all_chords.shape[0]
    n_types = len(all_chords.chord.unique())
    n_documents = len(all_chords.groupby(level=[0, 1]))
    print(f"Dataset contains {n_tokens} tokens and {n_types} types over {n_documents} documents.")
    # Add the corpus display name (derived from the first index level) to both tables.
    all_annotations['corpus_name'] = all_annotations.index.get_level_values(0).map(get_corpus_display_name)
    all_chords['corpus_name'] = all_chords.index.get_level_values(0).map(get_corpus_display_name)
else:
    print("Dataset contains no annotations.")
mc | mn | quarterbeats | quarterbeats_all_endings | duration_qb | mc_onset | mn_onset | timesig | staff | voice | ... | cadence | phraseend | chord_type | globalkey_is_minor | localkey_is_minor | chord_tones | added_tones | root | bass_note | special | |||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
corpus | fname | interval | |||||||||||||||||||||
schubert_winterreise | n01 | [0.0, 2.0) | 1 | 1 | 0 | 0 | 2.000 | 0 | 0 | 2/4 | 3 | 1 | ... | <NA> | <NA> | m | True | True | (0, -3, 1) | () | 0 | 0 | <NA> |
[2.0, 2.5) | 2 | 2 | 2 | 2 | 0.500 | 0 | 0 | 2/4 | 3 | 1 | ... | <NA> | <NA> | m | True | True | (0, -3, 1) | (2,) | 0 | 0 | <NA> | ||
[2.5, 3.5) | 2 | 2 | 5/2 | 5/2 | 1.000 | 1/8 | 1/8 | 2/4 | 3 | 1 | ... | <NA> | <NA> | m | True | True | (0, -3, 1) | () | 0 | 0 | <NA> | ||
[3.5, 3.875) | 2 | 2 | 7/2 | 7/2 | 0.375 | 3/8 | 3/8 | 2/4 | 3 | 1 | ... | <NA> | <NA> | o7 | True | True | (-1, -4, 5, -3) | () | 5 | -1 | <NA> | ||
[3.875, 4.0) | 2 | 2 | 31/8 | 31/8 | 0.125 | 15/32 | 15/32 | 2/4 | 3 | 1 | ... | <NA> | <NA> | o7 | True | True | (-1, -4, 5, 2) | () | 5 | -1 | <NA> |
5 rows × 31 columns
Concatenated annotation tables contains 3100 rows.
Dataset contains 3100 tokens and 308 types over 24 documents.