Course 8: Learning with Music Signals
Main Tutor/Lecturer: Simon Schwär, Prof. Dr. Meinard Müller
The extraction of fundamental frequency (F0) information from music recordings is a crucial task in the field of music information retrieval. The sequence of F0 estimates over successive time frames (also called the F0 trajectory) often corresponds to a melodic phrase and serves as a representation for downstream tasks such as automatic music transcription and performance analysis. A large number of algorithms and tools for F0 estimation have been proposed in the literature. In this group, we delve into this important topic, studying various approaches to F0 estimation, ranging from traditional signal processing methods such as YIN and SWIPE to data-driven deep learning methods such as CREPE, SPICE, and PESTO. In doing so, we also gain a deeper understanding of the acoustic properties of music signals.
% YIN: fundamental frequency estimator for speech and music (JASA, 2002).
% The title has no trailing period because the bibliography style supplies final punctuation.
% The accented surname uses a LaTeX escape so the entry also works with classic 8-bit BibTeX.
@article{CheveigneK02_YIN_JASA,
  author  = {de Cheveign{\'e}, Alain and Kawahara, Hideki},
  title   = {{YIN}, a fundamental frequency estimator for speech and music},
  journal = {The Journal of the Acoustical Society of America},
  volume  = {111},
  number  = {4},
  pages   = {1917--1930},
  year    = {2002},
  url-pdf = {2002_CheveigneK_YIN_JASA.pdf}
}
% SWIPE: sawtooth-waveform-inspired pitch estimator for speech and music (JASA, 2008).
@article{CamachoH08_SawtoothWaveform_JASA,
  author    = {Camacho, Arturo and Harris, John G.},
  title     = {A sawtooth waveform inspired pitch estimator for speech and music},
  journal   = {The Journal of the Acoustical Society of America},
  publisher = {ASA},
  volume    = {124},
  number    = {3},
  pages     = {1638--1652},
  year      = {2008},
  url-pdf   = {2008_CamachoH_SWIPE_JASA.pdf}
}
% Melody extraction from polyphonic music signals using pitch contour characteristics (IEEE TASLP, 2012).
@article{SalamonG12_MelodyExtraction_TASLP,
  author  = {Salamon, Justin and G{\'o}mez, Emilia},
  title   = {Melody Extraction from Polyphonic Music Signals using Pitch Contour Characteristics},
  journal = {IEEE Transactions on Audio, Speech, and Language Processing},
  volume  = {20},
  number  = {6},
  pages   = {1759--1770},
  year    = {2012},
  doi     = {10.1109/TASL.2012.2188515},
  url-pdf = {2012_SalamonG_Melodia_TALSP.pdf}
}
% pYIN: fundamental frequency estimator using probabilistic threshold distributions (ICASSP, 2014).
% The booktitle uses the "Proceedings of the ..." form so that every ICASSP entry names the venue identically.
@inproceedings{MauchD14_pYIN_ICASSP,
  author    = {Mauch, Matthias and Dixon, Simon},
  title     = {{pYIN}: A Fundamental Frequency Estimator Using Probabilistic Threshold Distributions},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  address   = {Florence, Italy},
  pages     = {659--663},
  year      = {2014},
  url-pdf   = {2014_MauchD_pYIN_ICASSP.pdf}
}
% CREPE: convolutional representation for pitch estimation (ICASSP, 2018).
@inproceedings{KimSLB18_CREPE_ICASSP,
  author    = {Kim, Jong Wook and Salamon, Justin and Li, Peter and Bello, Juan Pablo},
  title     = {{CREPE}: {A} Convolutional Representation for Pitch Estimation},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  address   = {Calgary, Canada},
  pages     = {161--165},
  year      = {2018},
  doi       = {10.1109/ICASSP.2018.8461329},
  url-pdf   = {2018_KimSLB_PtichEstCREPE_ICASSP_arXiv}
}
% SPICE: self-supervised pitch estimation (IEEE/ACM TASLP, 2020).
% Case protection braces whole words (the acronym and the word after the colon), not single letters,
% so sentence-casing styles keep the capitals without breaking kerning or hyphenation.
@article{GfellerFRSTV20_SPICE_IEEE,
  author  = {Gfeller, Beat and Frank, Christian and Roblek, Dominik and Sharifi, Matthew and Tagliasacchi, Marco and Velimirovic, Mihajlo},
  title   = {{SPICE}: {Self-supervised} Pitch Estimation},
  journal = {{IEEE/ACM} Transactions on Audio, Speech, and Language Processing},
  volume  = {28},
  pages   = {1118--1128},
  year    = {2020},
  url-pdf = {2020_GfellerEtAl_SPICE_TASLP.pdf}
}
% PESTO: pitch estimation with a self-supervised transposition-equivariant objective (ISMIR, 2023).
% The accented given name uses a LaTeX escape so the entry also works with classic 8-bit BibTeX.
@inproceedings{RiouLHP23_PESTO_ISMIR,
  author    = {Riou, Alain and Lattner, Stefan and Hadjeres, Ga{\"e}tan and Peeters, Geoffroy},
  title     = {{PESTO}: Pitch Estimation with Self-supervised Transposition-equivariant Objective},
  booktitle = {Proceedings of the International Society for Music Information Retrieval Conference ({ISMIR})},
  address   = {Milano, Italy},
  pages     = {535--544},
  year      = {2023},
  url-pdf   = {2023_RiouLHP_PESTO_ISMIR.pdf}
}