Notes#

Notebook settings
-----------------

CORPUS_PATH: '/home/runner/work/workflow_deployment/grieg_lyric_pieces'
ANNOTATED_ONLY: False

Data and software versions
--------------------------

Data repo 'grieg_lyric_pieces' @ d5f69bc
dimcat version 0.3.0
ms3 version 2.5.2

# Create a Dataset object and load the corpus found under CORPUS_PATH into it.
# NOTE(review): `dc` is presumably the dimcat package listed under the software versions -- confirm against the imports.
# NOTE(review): parse_tsv=False presumably skips parsing the TSV files at load time -- confirm in the dimcat docs.
dataset = dc.Dataset()
dataset.load(directory=CORPUS_PATH, parse_tsv=False)

[default|all]
All corpora
-----------
View: This view is called 'default'. It 
	- excludes pieces that are not contained in the metadata,
	- filters out file extensions requiring conversion (such as .xml), and
	- excludes review files and folders.

                        has   active   scores measures           notes        expanded          chords       
                   metadata     view detected detected parsed detected parsed detected parsed detected parsed
corpus                                                                                                       
grieg_lyric_pieces      yes  default       66       66     66       66     66       66     66       66     66

462/1518 files are excluded from this view.

462 files have been excluded based on their subdir.

N = 66 annotated pieces, 264 parsed dataframes.

Metadata#

# Retrieve the metadata table of the loaded dataset.
all_metadata = dataset.data.metadata()
# Report how many of the dataset's pieces are covered by a metadata row.
print(f"Concatenated 'metadata.tsv' files cover {len(all_metadata)} of the {dataset.data.count_pieces()} scores.")
# Preview: move index level 1 into a column, keep the first row of every
# level-0 group and show only the first 20 columns.
all_metadata.reset_index(level=1).groupby(level=0).nth(0).iloc[:,:20]

Concatenated 'metadata.tsv' files cover 66 of the 66 scores.

	piece	TimeSig	KeySig	last_mc	last_mn	length_qb	last_mc_unfolded	last_mn_unfolded	length_qb_unfolded	volta_mcs	all_notes_qb	n_onsets	n_onset_positions	guitar_chord_count	form_label_count	label_count	annotated_key	harmony_version	annotators	reviewers
corpus
grieg_lyric_pieces	op12n01	{1: '2/4'}	{1: -3}	23	23	46.0	23	23	46.0		135.5	268	156	0	0	43	Eb	2.3.0	Adrian Nagel (2.1.1), John Heilig (2.30)	Adrian Nagel

Compute chronological order

# Order the corpora using the helper chronological_corpus_order() (defined elsewhere).
chronological_order = chronological_corpus_order(all_metadata)
# Pair every corpus with one color from CORPUS_COLOR_SCALE; zip() stops at the
# shorter of the two sequences.
corpus_colors = dict(zip(chronological_order, CORPUS_COLOR_SCALE))
# Display the resulting order as the cell output.
chronological_order

['grieg_lyric_pieces']

# Collect the 'notes' tables of all pieces.
# NOTE(review): force=True makes ms3 parse the scores iteratively (see the warning in the cell output);
# flat=True presumably yields one concatenated DataFrame -- confirm in the ms3 docs.
all_notes = dataset.data.get_all_parsed('notes', force=True, flat=True)
# Grouping by index levels 0 and 1 yields one group per file, so the number of groups is the file count.
print(f"{len(all_notes.index)} notes over {len(all_notes.groupby(level=[0,1]))} files.")
# Preview the first rows as the cell output.
all_notes.head()

WARNING  ms3.Parse.grieg_lyric_pieces -- /home/runner/.local/lib/python3.10/site-packages/ms3/corpus.py (line 1255) check_number_of_unparsed_scores():
	You have set force=True, which forces me to parse 66 scores iteratively. Next time, call _.parse() on me, so we can speed this up!

65818 notes over 66 files.

			mc	mn	quarterbeats	quarterbeats_all_endings	duration_qb	mc_onset	mn_onset	timesig	staff	voice	...	nominal_duration	scalar	tied	tpc	midi	name	octave	chord_id	volta	tremolo
corpus	piece	i
grieg_lyric_pieces	op12n01	0	1	1	0	0	2.00	0	0	2/4	2	2	...	1/2	1	1	-3	51	Eb3	3	12	<NA>	NaN
		1	1	1	0	0	0.25	0	0	2/4	2	1	...	1/16	1	<NA>	-2	58	Bb3	3	4	<NA>	NaN
		2	1	1	0	0	0.50	0	0	2/4	1	1	...	1/8	1	<NA>	1	79	G5	5	0	<NA>	NaN
		3	1	1	1/4	1/4	0.25	1/16	1/16	2/4	2	1	...	1/16	1	<NA>	-3	63	Eb4	4	5	<NA>	NaN
		4	1	1	1/2	1/2	0.25	1/8	1/8	2/4	2	1	...	1/16	1	<NA>	1	67	G4	4	6	<NA>	NaN

5 rows × 22 columns

def weight_notes(nl, group_col='midi', precise=True):
    """Turn summed note durations into a Series of repeated group values.

    The durations in ``nl.duration_qb`` are summed per value of ``group_col``
    and divided by the smallest positive sum, so that the least represented
    value ends up with a weight of 1. The weights are then expanded by
    :func:`repeat_notes_according_to_weights`.

    Args:
        nl: DataFrame with a ``duration_qb`` column and a ``group_col`` column.
        group_col: Name of the column whose values are weighted and repeated.
        precise: If False, all weights are roughly halved, which shrinks the
            result (and compute time) at the cost of precision.

    Returns:
        A Series in which each value of ``group_col`` occurs as often as its
        rounded weight; empty if the weights cannot be converted to integers
        (e.g. when no group has a positive summed duration).
    """
    weights = nl.groupby(group_col).duration_qb.sum()
    smallest_positive = weights[weights > 0].min()
    # Normalize so that the smallest positive sum corresponds to one occurrence.
    weights = weights / smallest_positive
    if not precise:
        # Dividing by slightly less than 2 keeps the smallest weight just above
        # 0.5; an exact 0.5 would be rounded down to zero occurrences.
        weights = weights / 1.9999999
    return repeat_notes_according_to_weights(weights)


def repeat_notes_according_to_weights(weights):
    """Repeat every index value of ``weights`` as often as its rounded weight.

    Args:
        weights: Series mapping values (index) to numeric weights.

    Returns:
        A Series containing each index value ``round(weight)`` times, in index
        order. If rounding/casting to int fails for any reason, an empty
        integer Series is returned instead of raising.
    """
    try:
        repetitions = weights.round().astype(int)
    except Exception:
        # Best effort: weights that cannot be cast (e.g. NaN) yield no notes.
        return pd.Series(dtype=int)
    expanded = [
        value
        for value, times in repetitions.items()
        for _ in range(times)
    ]
    return pd.Series(expanded)

Ambitus#

# Map every corpus to its display name (get_corpus_display_name is defined elsewhere),
# iterating in chronological order so the dict preserves that order.
corpus_names = {corp: get_corpus_display_name(corp) for corp in chronological_order}
# Display names in the same order as chronological_order.
chronological_corpus_names = list(corpus_names.values())
# Re-key the corpus -> color mapping by display name.
corpus_name_colors = {corpus_names[corp]: color for corp, color in corpus_colors.items()}
# Add a column with the display name of the corpus (index level 0) each note row belongs to.
all_notes['corpus_name'] = all_notes.index.get_level_values(0).map(corpus_names)

# One group of notes per corpus display name.
grouped_notes = all_notes.groupby('corpus_name')
# Expand each corpus into duration-weighted MIDI pitches (reduced precision to
# save compute time); the mapping's keys become the outer index level.
weighted_midi = pd.concat(
    {
        group_key: weight_notes(nl, 'midi', precise=False)
        for group_key, nl in grouped_notes
    }
)
# Turn the outer index level (the corpus name) into a regular column.
weighted_midi = weighted_midi.reset_index(level=0)
weighted_midi.columns = ['dataset', 'midi']
weighted_midi

	dataset	midi
0	Grieg Lyric Pieces	21
1	Grieg Lyric Pieces	23
2	Grieg Lyric Pieces	23
3	Grieg Lyric Pieces	23
4	Grieg Lyric Pieces	23
...	...	...
27852	Grieg Lyric Pieces	96
27853	Grieg Lyric Pieces	96
27854	Grieg Lyric Pieces	97
27855	Grieg Lyric Pieces	99
27856	Grieg Lyric Pieces	102

27857 rows × 2 columns

# Label the pitch axis at every octave: MIDI 12, 24, ..., 96 are shown as C0 ... C7.
yaxis = {
    'tickmode': 'array',
    'tickvals': [12, 24, 36, 48, 60, 72, 84, 96],
    'ticktext': ["C0", "C1", "C2", "C3", "C4", "C5", "C6", "C7"],
    'gridcolor': 'lightgrey',
}
# One violin (with inner box plot) per dataset, colored and ordered by corpus.
fig = px.violin(
    weighted_midi,
    x='dataset',
    y='midi',
    color='dataset',
    box=True,
    labels={
        'dataset': '',
        'midi': 'distribution of pitches by duration',
    },
    category_orders={'dataset': chronological_corpus_names},
    color_discrete_map=corpus_name_colors,
    width=1000,
    height=600,
)
# 'hard' span: the density curve ends at the outermost data points.
fig.update_traces(spanmode='hard')
fig.update_layout(yaxis=yaxis, **STD_LAYOUT, showlegend=False)
fig.show()

Tonal Pitch Classes (TPC)#

# Expand each corpus into duration-weighted tonal pitch classes (full
# precision); the mapping's keys become the outer index level.
weighted_tpc = pd.concat(
    {
        group_key: weight_notes(nl, 'tpc')
        for group_key, nl in grouped_notes
    }
)
# Turn the outer index level (the corpus name) into a regular column.
weighted_tpc = weighted_tpc.reset_index(level=0)
weighted_tpc.columns = ['dataset', 'tpc']
weighted_tpc

	dataset	tpc
0	Grieg Lyric Pieces	-11
1	Grieg Lyric Pieces	-10
2	Grieg Lyric Pieces	-10
3	Grieg Lyric Pieces	-10
4	Grieg Lyric Pieces	-10
...	...	...
74268	Grieg Lyric Pieces	16
74269	Grieg Lyric Pieces	16
74270	Grieg Lyric Pieces	16
74271	Grieg Lyric Pieces	16
74272	Grieg Lyric Pieces	16

74273 rows × 2 columns

As violin plot#

# Label every third tonal pitch class on the axis (0 is shown as C) and draw a zero line.
yaxis = {
    'tickmode': 'array',
    'tickvals': [-12, -9, -6, -3, 0, 3, 6, 9, 12, 15, 18],
    'ticktext': ["Dbb", "Bbb", "Gb", "Eb", "C", "A", "F#", "D#", "B#", "G##", "E##"],
    'gridcolor': 'lightgrey',
    'zerolinecolor': 'lightgrey',
    'zeroline': True,
}
# One violin (with inner box plot) per dataset, colored and ordered by corpus.
fig = px.violin(
    weighted_tpc,
    x='dataset',
    y='tpc',
    color='dataset',
    box=True,
    labels={
        'dataset': '',
        'tpc': 'distribution of tonal pitch classes by duration',
    },
    category_orders={'dataset': chronological_corpus_names},
    color_discrete_map=corpus_name_colors,
    width=1000,
    height=600,
)
# 'hard' span: the density curve ends at the outermost data points.
fig.update_traces(spanmode='hard')
fig.update_layout(yaxis=yaxis, **STD_LAYOUT, showlegend=False)
fig.show()

As bar plots#

# Total duration per tonal pitch class over all notes.
bar_data = all_notes.groupby('tpc').duration_qb.sum().reset_index()
# One tick for every integer between the lowest and highest occurring tpc,
# labelled with the names returned by ms3.fifths2name().
x_values = list(range(bar_data.tpc.min(), bar_data.tpc.max()+1))
x_names = ms3.fifths2name(x_values)
fig = px.bar(
    bar_data,
    x='tpc',
    y='duration_qb',
    labels={
        'tpc': 'Named pitch class',
        'duration_qb': 'Duration in quarter notes',
    },
    color_discrete_sequence=CORPUS_COLOR_SCALE,
    width=1000,
    height=300,
)
fig.update_layout(**STD_LAYOUT)
fig.update_yaxes(gridcolor='lightgrey')
fig.update_xaxes(
    gridcolor='lightgrey',
    zerolinecolor='grey',
    tickmode='array',
    tickvals=x_values,
    ticktext=x_names,
    dtick=1,
    ticks='outside',
    tickcolor='black',
    # Darker minor grid line every six steps.
    minor={'dtick': 6, 'gridcolor': 'grey', 'showgrid': True},
)
fig.show()

# Total duration per corpus and tonal pitch class (also reused by the scatter plot).
scatter_data = all_notes.groupby(['corpus_name', 'tpc']).duration_qb.sum().reset_index()
fig = px.bar(scatter_data, x='tpc', y='duration_qb', color='corpus_name',
             labels=dict(
                 duration_qb='duration',
                 tpc='named pitch class',
             ),
             # The key has to be the name of the column being ordered. This frame
             # has no 'dataset' column, so that key was silently ignored and the
             # chronological order never applied.
             category_orders=dict(corpus_name=chronological_corpus_names),
             color_discrete_map=corpus_name_colors,
             width=1000, height=500,
             )
fig.update_layout(**STD_LAYOUT)
fig.update_yaxes(gridcolor='lightgrey')
# Same tick labelling as the overall bar plot: one named tick per tpc value,
# with a darker minor grid line every six steps.
fig.update_xaxes(gridcolor='lightgrey', zerolinecolor='grey', tickmode='array',
                 tickvals=x_values, ticktext=x_names, dtick=1, ticks='outside', tickcolor='black',
                 minor=dict(dtick=6, gridcolor='grey', showgrid=True),
                 )
fig.show()

As scatter plots#

# One facet per corpus showing the summed duration of every tonal pitch class.
fig = px.scatter(scatter_data, x='tpc', y='duration_qb', color='corpus_name',
                 labels=dict(
                     duration_qb='duration',
                     tpc='named pitch class',
                 ),
                 # The key has to be the name of the column being ordered. It was
                 # 'dataset', which does not exist in scatter_data, so the
                 # chronological order of colors and facets was silently ignored.
                 category_orders=dict(corpus_name=chronological_corpus_names),
                 color_discrete_map=corpus_name_colors,
                 facet_col='corpus_name', facet_col_wrap=3, facet_col_spacing=0.03,
                 width=1000, height=1000,
                 )
# Connect the points of each corpus with lines.
fig.update_traces(mode='lines+markers')
# Facet titles come as "corpus_name=<value>"; keep only the value.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(**STD_LAYOUT, showlegend=False)
fig.update_xaxes(gridcolor='lightgrey', zerolinecolor='lightgrey', tickmode='array', tickvals=[-12, -6, 0, 6, 12, 18],
                 ticktext=["Dbb", "Gb", "C", "F#", "B#", "E##"], visible=True)
# matches=None gives every facet its own y-range; tick labels are shown on each facet.
fig.update_yaxes(gridcolor='lightgrey', zeroline=False, matches=None, showticklabels=True)
fig.show()

# Split the summed durations into tonal pitch classes inside and outside the inclusive range -1..5.
# NOTE(review): -1..5 presumably are the seven pitch classes without accidentals (F, C, G, D, A, E, B);
# this matches the tick labels used in the plots (0 = C, -3 = Eb, 6 = F#) -- confirm against ms3.fifths2name.
no_accidental = bar_data[bar_data.tpc.between(-1,5)].duration_qb.sum()
with_accidental = bar_data[~bar_data.tpc.between(-1,5)].duration_qb.sum()

# Total duration of all notes, used as the denominator.
entire = no_accidental + with_accidental
# The bare f-string is the cell's display value.
f"Fraction of note duration without accidental of the entire durations: {no_accidental} / {entire} = {no_accidental / entire}"

'Fraction of note duration without accidental of the entire durations: 38437.7875 / 55705.291666666664 = 0.6900203975235746'

Notes and staves#

# Show how the notes are distributed over the staves of the scores.
print("Distribution of notes over staves:")
# value_count_df is a helper defined elsewhere.
# NOTE(review): judging by the cell output it returns absolute counts and fractions per value -- confirm.
value_count_df(all_notes.staff)

Distribution of notes over staves:

	counts	%
staff
1	36716	0.557841
2	28680	0.435747
3	422	0.006412