Many recent advances in audio, speech, and music processing have been driven by techniques based on deep learning (DL). For example, DL-based techniques have led to significant improvements in speaker separation, speech synthesis, acoustic scene analysis, audio retrieval, chord recognition, melody estimation, and beat tracking. Considering specific audio, speech, and music processing tasks, we study various DL-based approaches and their capability to extract complex features and make predictions based on hidden structures and relations. Rather than giving a comprehensive overview, we will study selected and generally applicable DL-based techniques. Furthermore, in the context of challenging application scenarios, we will critically review the potential and limitations of recent deep learning techniques. As one main general objective of the lecture, we want to discuss how you can integrate domain knowledge into neural network architectures to obtain explainable models that are less vulnerable to data biases and confounding factors.
The course consists of two overview-like lectures, where we introduce current research problems in audio, speech, and music processing. We will then continue with 6 to 8 lectures on selected audio processing topics and DL-based techniques. Being based on articles from the research literature, we will provide detailed explanations covered in mathematical depth; we may also try to attract some of the original authors to serve as guest lecturers. Finally, we round off the course by a concluding lecture covering practical aspects (e.g., hardware, software, version control, reproducibility, datasets) that are relevant when working with DL-based techniques.
In this course, we require a good knowledge of deep learning techniques, machine learning, and pattern recognition as well as a strong mathematical background. Furthermore, we require a solid background in general digital signal processing and some experience with audio, image, or video processing.
It is recommended to finish the following modules (or to have acquired equivalent knowledge) before starting this module:
There will be oral examinations (30 minutes) either in July or October. In the exam, you should be able to summarize the lectures' content and to answer general questions as listed below. Additionally, you need to pick one of the lectures (Lecture 3 to Lecture 9) as your in-depth topic, where you should be able to answer detailed technical questions about the specified papers. For further details and appointments, please check StudOn.
The course consists of two overview-like lectures, where we introduce current research problems in audio, speech, and music processing. We will then continue with 6 to 8 lectures which are based on articles from the research literature. The lecture material includes handouts of slides, links to the original articles, and possibly links to demonstrators and further online resources. In the following list, you find links to the material. If you have any questions regarding the lecture, please contact Prof. Dr. ir. Emanuël Habets and Prof. Dr. Meinard Müller.
The following tentative schedule gives an overview:
@book{Mueller15_FMP_SPRINGER,
  author    = {Meinard M{\"u}ller},
  title     = {Fundamentals of Music Processing},
  publisher = {Springer Verlag},
  year      = {2015},
  internal-note = {Former nonstandard field 'type = {Monograph}' removed: 'type' is not a recognized @book field and is silently ignored by standard styles.}
}
@inproceedings{MuellerZ19_FMP_ISMIR,
  author    = {Meinard M{\"u}ller and Frank Zalkow},
  title     = {{FMP} {N}otebooks: {E}ducational Material for Teaching and Learning Fundamentals of Music Processing},
  booktitle = {Proceedings of the International Society for Music Information Retrieval Conference ({ISMIR})},
  address   = {Delft, The Netherlands},
  pages     = {573--580},
  year      = {2019},
  doi       = {10.5281/zenodo.3527872}
}
@inproceedings{HersheyCRW16_DeepClustering_ICASSP,
  author    = {John R. Hershey and Zhuo Chen and Jonathan Le Roux and Shinji Watanabe},
  title     = {Deep clustering: Discriminative embeddings for segmentation and separation},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  pages     = {31--35},
  year      = {2016},
  doi       = {10.1109/ICASSP.2016.7471631}
}
@inproceedings{ChenLM17_DeepAttractor,
  author    = {Zhuo Chen and Yi Luo and Nima Mesgarani},
  title     = {Deep attractor network for single-microphone speaker separation},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  pages     = {246--250},
  year      = {2017},
  doi       = {10.1109/ICASSP.2017.7952155}
}
@article{KolbaekYTJ17_SpeechSep_TASLP,
  author  = {Morten Kolb{\ae}k and Dong Yu and Zheng-Hua Tan and Jesper Jensen},
  title   = {Multitalker Speech Separation With Utterance-Level Permutation Invariant Training of Deep Recurrent Neural Networks},
  journal = {{IEEE/ACM} Transactions on Audio, Speech, and Language Processing},
  volume  = {25},
  number  = {10},
  pages   = {1901--1913},
  year    = {2017},
  doi     = {10.1109/TASLP.2017.2726762}
}
@inproceedings{KavalerovWEPWRH19_UniversalSoundSep_WASPAA,
  author    = {Ilya Kavalerov and Scott Wisdom and Hakan Erdogan and Brian Patton and Kevin W. Wilson and Jonathan Le Roux and John R. Hershey},
  title     = {Universal Sound Separation},
  booktitle = {Proceedings of the {IEEE} Workshop on Applications of Signal Processing to Audio and Acoustics ({WASPAA})},
  pages     = {175--179},
  year      = {2019},
  doi       = {10.1109/WASPAA.2019.8937253}
}
@inproceedings{BrieglebSK01_EgeNoise_ICA,
  author    = {Annika Briegleb and Alexander Schmidt and Walter Kellermann},
  title     = {Deep Clustering for Single-Channel Ego-Noise Suppression},
  booktitle = {Proceedings of the International Congress on Acoustics ({ICA})},
  pages     = {2813--2820},
  year      = {2019},
  doi       = {10.18154/RWTH-CONV-239374},
  url-pdf   = {https://pub.dega-akustik.de/ICA2019/data/articles/000705.pdf},
  internal-note = {Key looks like a typo ('01' vs. year 2019; 'EgeNoise' vs. 'Ego-Noise') -- kept unchanged so existing citations of this key keep working; confirm before renaming.}
}
@comment{Removed a byte-identical duplicate of the entry
HersheyCRW16_DeepClustering_ICASSP, which is already defined earlier
in this file. Duplicate keys make BibTeX emit a "repeated entry"
error; all citations resolve via the earlier entry.}
@comment{Removed a byte-identical duplicate of the entry
ChenLM17_DeepAttractor, which is already defined earlier in this
file. Duplicate keys make BibTeX emit a "repeated entry" error; all
citations resolve via the earlier entry.}
@article{LuoCM18_DeepAttractorNetwork_TASLP,
  author  = {Yi Luo and Zhuo Chen and Nima Mesgarani},
  title   = {Speaker-Independent Speech Separation With Deep Attractor Network},
  journal = {{IEEE/ACM} Transactions on Audio, Speech, and Language Processing},
  volume  = {26},
  number  = {4},
  pages   = {787--796},
  year    = {2018},
  doi     = {10.1109/TASLP.2018.2795749}
}
@inproceedings{JanssonHMBKW17_SingingSep_ISMIR,
  author    = {Andreas Jansson and Eric J. Humphrey and Nicola Montecchio and Rachel M. Bittner and Aparna Kumar and Tillman Weyde},
  title     = {Singing Voice Separation with Deep {U}-{N}et Convolutional Networks},
  booktitle = {Proceedings of the International Society for Music Information Retrieval Conference ({ISMIR})},
  pages     = {745--751},
  year      = {2017},
  doi       = {10.5281/zenodo.1414934},
  url-pdf   = {https://ismir2017.smcnus.org/wp-content/uploads/2017/10/171\_Paper.pdf}
}
@article{StoterULM19_Unmix_JOSS,
  author  = {Fabian{-}Robert St{\"{o}}ter and Stefan Uhlich and Antoine Liutkus and Yuki Mitsufuji},
  title   = {{Open-Unmix} -- {A} Reference Implementation for Music Source Separation},
  journal = {Journal of Open Source Software ({JOSS})},
  volume  = {4},
  number  = {41},
  pages   = {1667},
  year    = {2019},
  doi     = {10.21105/joss.01667}
}
@inproceedings{RonnebergerFB15_UNet_LNCS,
  author    = {Olaf Ronneberger and Philipp Fischer and Thomas Brox},
  editor    = {Nassir Navab and Joachim Hornegger and William M. Wells III and Alejandro F. Frangi},
  title     = {{U}-{N}et: {C}onvolutional Networks for Biomedical Image Segmentation},
  booktitle = {Proceedings of Medical Image Computing and Computer-Assisted Intervention ({MICCAI})},
  series    = {Lecture Notes in Computer Science},
  volume    = {9351},
  pages     = {234--241},
  publisher = {Springer},
  year      = {2015},
  doi       = {10.1007/978-3-319-24574-4_28}
}
@article{OdenaDO16_deconvolution_Destill,
  author  = {Augustus Odena and Vincent Dumoulin and Chris Olah},
  title   = {Deconvolution and Checkerboard Artifacts},
  journal = {Distill},
  volume  = {1},
  number  = {10},
  year    = {2016},
  url     = {https://distill.pub/2016/deconv-checkerboard},
  doi     = {10.23915/distill.00003}
}
@inproceedings{SmaragdisV17_NMFAutoencoder_ICASSP,
  author    = {Paris Smaragdis and Shrikant Venkataramani},
  title     = {A neural network alternative to non-negative audio models},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  address   = {New Orleans, Louisiana, USA},
  pages     = {86--90},
  year      = {2017},
  doi       = {10.1109/ICASSP.2017.7952123}
}
@inproceedings{EwertS17_StructuredDropout_ICASSP,
  author    = {Sebastian Ewert and Mark B. Sandler},
  title     = {Structured Dropout for Weak Label and Multi-Instance Learning and Its Application to Score-Informed Source Separation},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  address   = {New Orleans, Louisiana, USA},
  pages     = {2277--2281},
  year      = {2017},
  doi       = {10.1109/ICASSP.2017.7952562}
}
@inproceedings{EwertM12_ScoreInformedNMF_ICASSP,
  author    = {Sebastian Ewert and Meinard M{\"u}ller},
  title     = {Using Score-Informed Constraints for {NMF}-based Source Separation},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  address   = {Kyoto, Japan},
  pages     = {129--132},
  month     = mar,
  year      = {2012},
  url-details = {http://resources.mpi-inf.mpg.de/MIR/ICASSP2012-ScoreInformedNMF/}
}
@article{ZmolikovaDKONBC19_SpeakerBeam_JSTSP,
  author  = {Katerina Zmolikov{\'{a}} and Marc Delcroix and Keisuke Kinoshita and Tsubasa Ochiai and Tomohiro Nakatani and Luk{\'{a}}s Burget and Jan Cernocky},
  title   = {{SpeakerBeam}: {S}peaker Aware Neural Network for Target Speaker Extraction in Speech Mixtures},
  journal = {{IEEE} Journal on Selected Topics in Signal Processing},
  volume  = {13},
  number  = {4},
  pages   = {800--814},
  year    = {2019},
  doi     = {10.1109/JSTSP.2019.2922820}
}
@inproceedings{MackBCH20_DOA_ICASSP,
  author    = {Wolfgang Mack and Ullas Bharadwaj and Soumitro Chakrabarty and Emanu{\"{e}}l A. P. Habets},
  title     = {Signal-Aware Broadband {DOA} Estimation Using Attention Mechanisms},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  pages     = {4930--4934},
  publisher = {{IEEE}},
  year      = {2020},
  doi       = {10.1109/ICASSP40776.2020.9053658}
}
@inproceedings{OchiaiDKON19_SpeakerBeam_Interspeech,
  author    = {Tsubasa Ochiai and Marc Delcroix and Keisuke Kinoshita and Atsunori Ogawa and Tomohiro Nakatani},
  title     = {Multimodal {SpeakerBeam}: {S}ingle Channel Target Speech Extraction with Audio-Visual Speaker Clues},
  booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association (Interspeech)},
  pages     = {2718--2722},
  publisher = {{ISCA}},
  year      = {2019},
  doi       = {10.21437/Interspeech.2019-1513}
}
@article{WangLY18_SpeakerLoc_AppliedSciences,
  author  = {Ziteng Wang and Junfeng Li and Yonghong Yan},
  title   = {Target Speaker Localization Based on the Complex {W}atson Mixture Model and Time-Frequency Selection Neural Network},
  journal = {Applied Sciences},
  volume  = {8},
  number  = {11},
  pages   = {2326},
  year    = {2018},
  url-pdf = {https://www.mdpi.com/2076-3417/8/11/2326},
  internal-note = {Pages field holds the MDPI article number taken from the URL path -- verify against the published article.}
}
@inproceedings{ShenPWSJYCZWRSA18_TTS_ICASSP,
  author    = {Jonathan Shen and Ruoming Pang and Ron J. Weiss and Mike Schuster and Navdeep Jaitly and Zongheng Yang and Zhifeng Chen and Yu Zhang and Yuxuan Wang and R. J. Skerry-Ryan and Rif A. Saurous and Yannis Agiomyrgiannakis and Yonghui Wu},
  title     = {Natural {TTS} Synthesis by Conditioning {WaveNet} on Mel Spectrogram Predictions},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  pages     = {4779--4783},
  year      = {2018},
  doi       = {10.1109/ICASSP.2018.8461368}
}
@inproceedings{RenRTQZZL19_FastSpeech_NeurIPS,
  author    = {Yi Ren and Yangjun Ruan and Xu Tan and Tao Qin and Sheng Zhao and Zhou Zhao and Tie-Yan Liu},
  title     = {{FastSpeech}: {F}ast, Robust and Controllable Text to Speech},
  booktitle = {Proceedings of the Annual Conference on Neural Information Processing Systems},
  pages     = {3165--3174},
  year      = {2019},
  url-pdf   = {https://proceedings.neurips.cc/paper/2019/file/f63f65b503e22cb970527f23c9ad7db1-Paper.pdf}
}
@inproceedings{MustafaPF21_StyleMelGAN_ICASSP,
  author    = {Ahmed Mustafa and Nicola Pia and Guillaume Fuchs},
  title     = {{StyleMelGAN}: {A}n Efficient High-Fidelity Adversarial Vocoder with Temporal Adaptive Normalization},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  pages     = {6034--6038},
  year      = {2021},
  doi       = {10.1109/ICASSP39728.2021.9413605},
  url-pdf   = {https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9413605}
}
@inproceedings{WangSSWWJYXCBLA17_Tacotron_Interspeech,
  author    = {Yuxuan Wang and R. J. Skerry-Ryan and Daisy Stanton and Yonghui Wu and Ron J. Weiss and Navdeep Jaitly and Zongheng Yang and Ying Xiao and Zhifeng Chen and Samy Bengio and Quoc V. Le and Yannis Agiomyrgiannakis and Rob Clark and Rif A. Saurous},
  title     = {{Tacotron}: {T}owards End-to-End Speech Synthesis},
  booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association (Interspeech)},
  pages     = {4006--4010},
  publisher = {{ISCA}},
  year      = {2017},
  url-pdf   = {https://www.isca-speech.org/archive/Interspeech_2017/pdfs/1452.PDF}
}
@inproceedings{ZalkowM20_WeaklyAlignedTrain_ISMIR,
  author    = {Frank Zalkow and Meinard M{\"u}ller},
  title     = {Using Weakly Aligned Score--Audio Pairs to Train Deep Chroma Models for Cross-Modal Music Retrieval},
  booktitle = {Proceedings of the International Society for Music Information Retrieval Conference ({ISMIR})},
  address   = {Montr{\'{e}}al, Canada},
  pages     = {184--191},
  year      = {2020},
  doi       = {10.5281/zenodo.4245400}
}
@inproceedings{StollerDE19_LyricsAlignment_ICASSP,
  author    = {Daniel Stoller and Simon Durand and Sebastian Ewert},
  title     = {End-to-end Lyrics Alignment for Polyphonic Music Using an Audio-To-Character Recognition Model},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  address   = {Brighton, {UK}},
  pages     = {181--185},
  year      = {2019},
  doi       = {10.1109/ICASSP.2019.8683470}
}
@inproceedings{GravesFGS06_CTCLoss_ICML,
  author    = {Alex Graves and Santiago Fern{\'{a}}ndez and Faustino J. Gomez and J{\"{u}}rgen Schmidhuber},
  title     = {Connectionist Temporal Classification: {L}abelling Unsegmented Sequence Data with Recurrent Neural Networks},
  booktitle = {Proceedings of the International Conference on Machine Learning ({ICML})},
  address   = {Pittsburgh, Pennsylvania, USA},
  pages     = {369--376},
  year      = {2006},
  doi       = {10.1145/1143844.1143891}
}