
Publications
At ELSA, we aim to inspire and share knowledge within our network and beyond. The collection of publications below provides an overview of both the network’s own output and research we support. Labels distinguish the categories.
Please note that this list makes no claim to completeness. If you have published a paper related to ELSA that should be listed, please reach out to our Press and Communications team.
0000
Nguyen, Khanh; Karatzas, Dimosthenis
Federated Document Visual Question Answering: A Pilot Study Proceedings Article
In: Smith, Elisa H. Barney; Liwicki, Marcus; Peng, Liangrui (Ed.): Document Analysis and Recognition – ICDAR 2024, 0000.
@inproceedings{10.1007/978-3-031-70552-6_9,
title = {Federated Document Visual Question Answering: A Pilot Study},
author = {Khanh Nguyen and Dimosthenis Karatzas},
editor = {Elisa H. Barney Smith and Marcus Liwicki and Liangrui Peng},
booktitle = {Document Analysis and Recognition - ICDAR 2024},
abstract = {"An important handicap of document analysis research is that documents tend to be copyrighted or contain private information, which prohibits their open publication and the creation of centralised, large-scale document datasets. Instead, documents are scattered in private data silos, making extensive training over heterogeneous data a tedious task. In this work, we explore the use of a federated learning (FL) scheme as a way to train a shared model on decentralised private document data. We focus on the problem of Document VQA, a task particularly suited to this approach, as the type of reasoning capabilities required from the model can be quite different in diverse domains. Enabling training over heterogeneous document datasets can thus substantially enrich DocVQA models. We assemble existing DocVQA datasets from diverse domains to reflect the data heterogeneity in real-world applications. We explore the self-pretraining technique in this multi-modal setting, where the same data is used for both pretraining and finetuning, making it relevant for privacy preservation. We further propose combining self-pretraining with a Federated DocVQA training method using centralized adaptive optimization that outperforms the FedAvg baseline. With extensive experiments (The code is available at https://github.com/khanhnguyen21006/fldocvqa), we also present a multi-faceted analysis on training DocVQA models with FL, which provides insights for future research on this task. We show that our pretraining strategies can effectively learn and scale up under federated training with diverse DocVQA datasets and tuning hyperparameters is essential for practical document tasks under federation."},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
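The abstract above compares the proposed training scheme against the standard FedAvg baseline. For readers unfamiliar with FedAvg, the following is a minimal, generic sketch of one communication round (plain PyTorch, a generic cross-entropy objective, and hypothetical client data loaders; it is not the paper's code and omits the proposed centralized adaptive optimization):

from copy import deepcopy
import torch
import torch.nn.functional as F

def fedavg_round(global_model, client_loaders, local_steps=1, lr=1e-4):
    # One communication round: each client trains locally from the current global
    # weights, then the server averages parameters weighted by local dataset size.
    # Assumes a classification-style model whose state_dict holds float tensors.
    client_states, client_sizes = [], []
    for loader in client_loaders:
        local = deepcopy(global_model)
        opt = torch.optim.SGD(local.parameters(), lr=lr)
        local.train()
        for _, (inputs, targets) in zip(range(local_steps), loader):
            opt.zero_grad()
            F.cross_entropy(local(inputs), targets).backward()
            opt.step()
        client_states.append(local.state_dict())
        client_sizes.append(len(loader.dataset))
    total = float(sum(client_sizes))
    averaged = {
        key: sum((n / total) * state[key] for n, state in zip(client_sizes, client_states))
        for key in client_states[0]
    }
    global_model.load_state_dict(averaged)
    return global_model

In the paper's setting, the locally trained model would be a DocVQA model, and the simple averaging step would be replaced by the server-side adaptive optimization the abstract describes.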
Franco, Danilo; Oneto, Luca; Anguita, Davide
Fair Empirical Risk Minimization Revised Proceedings Article
In: Rojas, Ignacio; Joya, Gonzalo; Catala, Andreu (Ed.): Advances in Computational Intelligence, 0000.
@inproceedings{10.1007/978-3-031-43085-5_3,
title = {Fair Empirical Risk Minimization Revised},
author = {Danilo Franco and Luca Oneto and Davide Anguita},
editor = {Ignacio Rojas and Gonzalo Joya and Andreu Catala},
booktitle = {Advances in Computational Intelligence},
abstract = {"Artificial Intelligence is nowadays ubiquitous, thanks to a continuous process of commodification, revolutionizing but also impacting society at large. In this paper, we address the problem of algorithmic fairness in Machine Learning: ensuring that sensitive information does not unfairly influence the outcome of a classifier. We extend the Fair Empirical Risk Minimization framework [10] where the fair risk minimizer is estimated via constrained empirical risk minimization. In particular, we first propose a new, more general, notion of fairness which translates into a fairness constraint. Then, we propose a new convex relaxation with stronger consistency properties deriving both risk and fairness bounds. By extending our approach to kernel methods, we will also show that the proposal empirically over-performs the state-of-the-art Fair Empirical Risk Minimization approach on several real-world datasets."},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Šimsa, Štěpán; Šulc, Milan; Uřičář, Michal; Patel, Yash; Hamdi, Ahmed; Kocián, Matěj; Skalický, Matyáš; Matas, Jiří; Doucet, Antoine; Coustaty, Mickael; Karatzas, Dimosthenis
DocILE Benchmark for Document Information Localization and Extraction Proceedings Article
In: Fink, Gernot A.; Jain, Rajiv; Kise, Koichi; Zanibbi, Richard (Ed.): Document Analysis and Recognition – ICDAR 2023, 0000.
@inproceedings{10.1007/978-3-031-41679-8_9,
title = {DocILE Benchmark for Document Information Localization and Extraction},
author = {Štěpán Šimsa and Milan Šulc and Michal Uřičář and Yash Patel and Ahmed Hamdi and Matěj Kocián and Matyáš Skalický and Jiří Matas and Antoine Doucet and Mickael Coustaty and Dimosthenis Karatzas},
editor = {Gernot A. Fink and Rajiv Jain and Koichi Kise and Richard Zanibbi},
booktitle = {Document Analysis and Recognition - ICDAR 2023},
abstract = {"This paper introduces the DocILE benchmark with the largest dataset of business documents for the tasks of Key Information Localization and Extraction and Line Item Recognition. It contains 6.7k annotated business documents, 100k synthetically generated documents, and nearly 1M unlabeled documents for unsupervised pre-training. The dataset has been built with knowledge of domain- and task-specific aspects, resulting in the following key features: (i) annotations in 55 classes, which surpasses the granularity of previously published key information extraction datasets by a large margin; (ii) Line Item Recognition represents a highly practical information extraction task, where key information has to be assigned to items in a table; (iii) documents come from numerous layouts and the test set includes zero- and few-shot cases as well as layouts commonly seen in the training set. The benchmark comes with several baselines, including RoBERTa, LayoutLMv3 and DETR-based Table Transformer; applied to both tasks of the DocILE benchmark, with results shared in this paper, offering a quick starting point for future work. The dataset, baselines and supplementary material are available at https://github.com/rossumai/docile."},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Baraldi, Lorenzo; Cocchi, Federico; Cornia, Marcella; Baraldi, Lorenzo; Nicolosi, Alessandro; Cucchiara, Rita
Contrasting Deepfakes Diffusion via Contrastive Learning and Global-Local Similarities Proceedings Article
In: Leonardis, Aleš; Ricci, Elisa; Roth, Stefan; Russakovsky, Olga; Sattler, Torsten; Varol, Gül (Ed.): Computer Vision – ECCV 2024, 0000.
@inproceedings{10.1007/978-3-031-73036-8_12,
title = {Contrasting Deepfakes Diffusion via Contrastive Learning and Global-Local Similarities},
author = {Lorenzo Baraldi and Federico Cocchi and Marcella Cornia and Lorenzo Baraldi and Alessandro Nicolosi and Rita Cucchiara},
editor = {Aleš Leonardis and Elisa Ricci and Stefan Roth and Olga Russakovsky and Torsten Sattler and Gül Varol},
booktitle = {Computer Vision – ECCV 2024},
abstract = {"Discerning between authentic content and that generated by advanced AI methods has become increasingly challenging. While previous research primarily addresses the detection of fake faces, the identification of generated natural images has only recently surfaced. This prompted the recent exploration of solutions that employ foundation vision-and-language models, like CLIP. However, the CLIP embedding space is optimized for global image-to-text alignment and is not inherently designed for deepfake detection, neglecting the potential benefits of tailored training and local image features. In this study, we propose CoDE (Contrastive Deepfake Embeddings), a novel embedding space specifically designed for deepfake detection. CoDE is trained via contrastive learning by additionally enforcing global-local similarities. To sustain the training of our model, we generate a comprehensive dataset that focuses on images generated by diffusion models and encompasses a collection of 9.2 million images produced by using four different generators. Experimental results demonstrate that CoDE achieves state-of-the-art accuracy on the newly collected dataset, while also showing excellent generalization capabilities to unseen image generators. Our source code, trained models, and collected dataset are publicly available at: https://github.com/aimagelab/CoDE."},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
König, Matthias; Zhang, Xiyue; Hoos, Holger H.; Kwiatkowska, Marta; van Rijn, Jan N.
Automated Design of Linear Bounding Functions for Sigmoidal Nonlinearities in Neural Networks Proceedings Article
In: Bifet, Albert; Davis, Jesse; Krilavičius, Tomas; Kull, Meelis; Ntoutsi, Eirini; Žliobaitė, Indrė (Ed.): Machine Learning and Knowledge Discovery in Databases. Research Track, 0000.
@inproceedings{10.1007/978-3-031-70368-3_23,
title = {Automated Design of Linear Bounding Functions for Sigmoidal Nonlinearities in Neural Networks},
author = {Matthias König and Xiyue Zhang and Holger H. Hoos and Marta Kwiatkowska and Jan N. van Rijn},
editor = {Albert Bifet and Jesse Davis and Tomas Krilavičius and Meelis Kull and Eirini Ntoutsi and Indrė Žliobaitė},
booktitle = {Machine Learning and Knowledge Discovery in Databases. Research Track},
abstract = {"The ubiquity of deep learning algorithms in various applications has amplified the need for assuring their robustness against small input perturbations such as those occurring in adversarial attacks. Existing complete verification techniques offer provable guarantees for all robustness queries but struggle to scale beyond small neural networks. To overcome this computational intractability, incomplete verification methods often rely on convex relaxation to over-approximate the nonlinearities in neural networks."},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Cappelletti, Silvia; Baraldi, Lorenzo; Cocchi, Federico; Cornia, Marcella; Baraldi, Lorenzo; Cucchiara, Rita
Adapt to Scarcity: Few-Shot Deepfake Detection via Low-Rank Adaptation Proceedings Article
In: Antonacopoulos, Apostolos; Chaudhuri, Subhasis; Chellappa, Rama; Liu, Cheng-Lin; Bhattacharya, Saumik; Pal, Umapada (Ed.): Pattern Recognition, 0000.
@inproceedings{10.1007/978-3-031-78305-0_8,
title = {Adapt to Scarcity: Few-Shot Deepfake Detection via Low-Rank Adaptation},
author = {Silvia Cappelletti and Lorenzo Baraldi and Federico Cocchi and Marcella Cornia and Lorenzo Baraldi and Rita Cucchiara},
editor = {Apostolos Antonacopoulos and Subhasis Chaudhuri and Rama Chellappa and Cheng-Lin Liu and Saumik Bhattacharya and Umapada Pal},
booktitle = {Pattern Recognition},
abstract = {"The boundary between AI-generated images and real photographs is becoming increasingly narrow, thanks to the realism provided by contemporary generative models. Such technological progress necessitates the evolution of existing deepfake detection algorithms to counter new threats and protect the integrity of perceived reality. Although the prevailing approach among deepfake detection methodologies relies on large collections of generated and real data, the efficacy of these methods in adapting to scenarios characterized by data scarcity remains uncertain. This obstacle arises due to the introduction of novel generation algorithms and proprietary generative models that impose restrictions on access to large-scale datasets, thereby constraining the availability of generated images. In this paper, we first analyze how the performance of current deepfake methodologies, based on the CLIP embedding space, adapt in a few-shot situation over four state-of-the-art generators. Being the CLIP embedding space not specifically tailored for the task, a fine-tuning stage is desirable, although the amount of data needed is often unavailable in a data scarcity scenario. To address this issue and limit possible overfitting, we introduce a novel approach through the Low-Rank Adaptation (LoRA) of the CLIP architecture, tailored for few-shot deepfake detection scenarios. Remarkably, the LoRA-modified CLIP, even when fine-tuned with merely 50 pairs of real and fake images, surpasses the performance of all evaluated deepfake detection models across the tested generators. Additionally, when LoRA CLIP is benchmarked against other models trained on 1,000 samples and evaluated on generative models not seen during training it exhibits superior generalization capabilities."},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
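To make the idea of low-rank adaptation of CLIP more concrete, here is a generic sketch (not the authors' released code) of attaching LoRA adapters to a CLIP vision encoder and adding a small real-vs-fake head, using the Hugging Face transformers and peft libraries; the checkpoint name, rank and target modules below are illustrative choices, not the paper's configuration:

import torch
import torch.nn as nn
from transformers import CLIPVisionModel
from peft import LoraConfig, get_peft_model

# Pretrained CLIP image encoder (ViT-B/32 weights as an example).
backbone = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
hidden_size = backbone.config.hidden_size

# Inject low-rank adapters into the attention projections; only these small
# matrices (plus the head below) are trained, which limits overfitting when
# only a handful of real/fake pairs are available.
lora_cfg = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.1,
                      target_modules=["q_proj", "v_proj"])
backbone = get_peft_model(backbone, lora_cfg)

# Lightweight binary head on the pooled image embedding (real vs. fake).
classifier = nn.Linear(hidden_size, 2)

def logits(pixel_values: torch.Tensor) -> torch.Tensor:
    # pixel_values: (batch, 3, 224, 224), preprocessed with the CLIP image processor.
    pooled = backbone(pixel_values=pixel_values).pooler_output
    return classifier(pooled)

Training then proceeds as ordinary cross-entropy fine-tuning on the few available image pairs; the exact rank, target layers and number of shots should be taken from the publication itself.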
Garcia-Bordils, Sergi; Karatzas, Dimosthenis; Rusiñol, Marçal
Accelerating Transformer-Based Scene Text Detection and Recognition via Token Pruning Proceedings Article
In: Fink, Gernot A.; Jain, Rajiv; Kise, Koichi; Zanibbi, Richard (Ed.): Document Analysis and Recognition – ICDAR 2023, 0000.
@inproceedings{10.1007/978-3-031-41731-3_7,
title = {Accelerating Transformer-Based Scene Text Detection and Recognition via Token Pruning},
author = {Sergi Garcia-Bordils and Dimosthenis Karatzas and Marçal Rusiñol},
editor = {Gernot A. Fink and Rajiv Jain and Koichi Kise and Richard Zanibbi},
booktitle = {Document Analysis and Recognition - ICDAR 2023},
abstract = {"Scene text detection and recognition is a crucial task in computer vision with numerous real-world applications. Transformer-based approaches are behind all current state-of-the-art models and have achieved excellent performance. However, the computational requirements of the transformer architecture makes training these methods slow and resource heavy. In this paper, we introduce a new token pruning strategy that significantly decreases training and inference times without sacrificing performance, striking a balance between accuracy and speed. We have applied this pruning technique to our own end-to-end transformer-based scene text understanding architecture. Our method uses a separate detection branch to guide the pruning of uninformative image features, which significantly reduces the number of tokens at the input of the transformer. Experimental results show how our network is able to obtain competitive results on multiple public benchmarks while running at significantly higher speeds."},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Tito, Ruben Perez
Exploring the role of Text in Visual Question Answering on Natural Scenes and Documents PhD Thesis
0000.
@phdthesis{nokey,
title = {Exploring the role of Text in Visual Question Answering on Natural Scenes and Documents},
author = {Ruben Perez Tito},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Li, Yi; Angelov, Plamen; Suri, Neeraj
Adversarial Attack Detection via Fuzzy Predictions Journal Article
In: IEEE Transactions on Fuzzy Systems, vol. 32, iss. 12, 0000, ISSN: 1063-6706.
@article{nokey,
title = {Adversarial Attack Detection via Fuzzy Predictions},
author = {Yi Li and Plamen Angelov and Neeraj Suri},
url = {https://ssg.lancs.ac.uk/wp-content/uploads/TFS_Oct-2024.pdf},
issn = {1063-6706},
journal = {IEEE Transactions on Fuzzy Systems},
volume = {32},
issue = {12},
abstract = {Image processing using neural networks act as a tool to speed up predictions for users, specifically on large-scale image samples. To guarantee the clean data for training accuracy, various deep learning-based adversarial attack detection techniques have been proposed. These crisp set-based detection methods directly determine whether an image is clean or attacked, while, calculating the loss is nondifferentiable and hinders training through normal back-propagation. Motivated by the recent success in fuzzy systems, in this work, we present an attack detection method to further improve detection performance, which is suitable for any pretrained neural network classifier. Subsequently, the fuzzification network is used to obtain feature maps to produce fuzzy sets of difference degree between clean and attacked images. The fuzzy rules control the intelligence that determines the detection boundaries. Different from previous fuzzy systems, we propose a fuzzy mean-intelligence mechanism with new support and confidence functions to improve fuzzy rule's quality. In the defuzzification layer, the fuzzy prediction from the intelligence is mapped back into the crisp model predictions for images. The loss between the prediction and label controls the rules to train the fuzzy detector. We show that the fuzzy rule-based network learns rich feature information than binary outputs and offer to obtain an overall performance gain. Experiment results show that compared to various benchmark fuzzy systems and adversarial attack detection methods, our fuzzy detector achieves better detection performance over a wide range of images.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Oneto, Luca; Ridella, Sandro; Anguita, Davide
Informed Machine Learning: Excess risk and generalization Journal Article
In: Neurocomputing, vol. 646, 0000.
@article{nokey,
title = {Informed Machine Learning: Excess risk and generalization},
author = {Luca Oneto and Sandro Ridella and Davide Anguita},
url = {https://www.sciencedirect.com/science/article/pii/S0925231225011932?via%3Dihub},
journal = {Neurocomputing},
volume = {646},
abstract = {Machine Learning (ML) has transformed both research and industry by offering powerful models capable of capturing complex phenomena. However, these models often require large, high-quality datasets and may struggle to generalize beyond the distributions on which they are trained. Informed Machine Learning (IML) tackles these challenges by incorporating domain knowledge at various stages of the ML pipeline, thereby reducing data requirements and enhancing generalization. Building on statistical learning theory, we present some theoretical comparison and insights about ML and IML excess risk and generalization performance. We then illustrate how these theoretical insights can be leveraged in practice through some practical examples. Our findings shed some light on the mechanisms and conditions under which IML can outperform traditional ML, offering valuable guidance for effective implementation in real-world settings.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Buselli, Irene; López, Anna Pallarès; Anguita, Davide; Roli, Fabio; Oneto, Luca
Mitigating Unfair Regression in Machine Learning Model Updates Proceedings Article
In: 2024 International Conference on Machine Learning and Applications (ICMLA), 0000, ISBN: 979-8-3503-7488-9.
@inproceedings{nokey,
title = {Mitigating Unfair Regression in Machine Learning Model Updates},
author = {Irene Buselli and Anna Pallarès López and Davide Anguita and Fabio Roli and Luca Oneto},
url = {https://ieeexplore.ieee.org/document/10903280/authors#authors},
isbn = {979-8-3503-7488-9},
booktitle = {2024 International Conference on Machine Learning and Applications (ICMLA)},
abstract = {Machine learning systems often require updates for various reasons, such as the availability of new data or models and the need to optimize different technical or ethical metrics. Typically, these metrics reflect an average performance rather than sample-wise behavior. Indeed, improvements in metrics like accuracy can introduce negative flips, where the updated model makes errors that the previous model did not make. In certain applications, these negative flips can be perceived by developers or users as a regression in performance, contributing to the hidden technical debt of machine learning systems. Moreover, if the distribution of negative flips is biased with respect to some sensitive attribute (e.g., gender or race), it may be perceived as discrimination, termed unfair regression. In this paper we show, for the first time, the existence of the phenomenon of unfair regression and propose different ethical metrics to measure it. Additionally, we offer two mitigation strategies - one focused on modifying the learning algorithm and one focused on modifying the tuning phase - to address this issue. Our results on real-world datasets confirm the existence of the unfair regression phenomenon and demonstrate the effectiveness of the proposed mitigation strategies.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Zhang, Zhaonian; Aggarwal, Vaneet; Angelov, Plamen; Jiang, Richard
Modeling Brain Aging with Explainable Triamese ViT: Towards Deeper Insights into Autism Disorder Journal Article
In: IEEE Journal of Biomedical and Health Informatics, 0000.
@article{nokey,
title = {Modeling Brain Aging with Explainable Triamese ViT: Towards Deeper Insights into Autism Disorder},
author = {Zhaonian Zhang and Vaneet Aggarwal and Plamen Angelov and Richard Jiang},
url = {https://pubmed.ncbi.nlm.nih.gov/40424106/},
doi = {10.1109/JBHI.2025.3574366},
journal = {IEEE Journal of Biomedical and Health Informatics},
abstract = {Machine learning, particularly through advanced imaging techniques such as three-dimensional Magnetic Resonance Imaging (MRI), has significantly improved medical diagnostics. This is especially critical for diagnosing complex conditions like Alzheimer's disease. Our study introduces Triamese-ViT, an innovative Tri-structure of Vision Transformers (ViTs) that incorporates a built-in interpretability function, it has structure-aware explainability that allows for the identification and visualization of key features or regions contributing to the prediction, integrates information from three perspectives to enhance brain age estimation. This method not only increases accuracy but also improves interoperability with existing techniques. When evaluated, Triamese-ViT demonstrated superior performance and produced insightful attention maps. We applied these attention maps to the analysis of natural aging and the diagnosis of Autism Spectrum Disorder (ASD). The results aligned with those from occlusion analysis, identifying the Cingulum, Rolandic Operculum, Thalamus, and Vermis as important regions in normal aging, and highlighting the Thalamus and Caudate Nucleus as key regions for ASD diagnosis.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Li, Yi; Angelov, Plamen; Suri, Neeraj
Rethinking Self-supervised Learning for Cross-domain Adversarial Sample Recovery Proceedings
0000, ISSN: 2161-4407.
@proceedings{nokey,
title = {Rethinking Self-supervised Learning for Cross-domain Adversarial Sample Recovery},
author = {Yi Li and Plamen Angelov and Neeraj Suri},
url = {https://ieeexplore.ieee.org/document/10650687},
issn = {2161-4407},
issue = {IJCNN 2024},
abstract = {Adversarial attacks can cause misclassification in machine learning pipelines, posing a significant safety risk in critical applications such as autonomous systems or medical applications. Supervised learning-based methods for adversarial sample recovery rely heavily on large volumes of labeled data, which often results in substantial performance degradation when applying the trained model to new domains. In this paper, differing from conventional self-supervised learning techniques such as data augmentation, we present a novel two-stage self-supervised representation learning framework for the task of adversarial sample recovery, aimed at overcoming these limitations. In the first stage, we employ a clean image autoencoder (CAE) to learn representations of clean images. Subsequently, the second stage utilizes an adversarial image autoencoder (AAE) to learn a shared latent space that captures the relationships between the representations acquired by CAE and AAE. It is noteworthy that the input clean images in the first stage and adversarial images in the second stage are cross-domain and not paired. To the best of our knowledge, this marks the first instance of self-supervised adversarial sample recovery work that operates without the need for labeled data. Our experimental evaluations, spanning a diverse range of images, consistently demonstrate the superior performance of the proposed method compared to conventional adversarial sample recovery methods.},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Mostowsky, Peter; et al.
The GeometricKernels Package: Heat and Matérn Kernels for Geometric Learning on Manifolds, Meshes, and Graphs Online
0000.
@online{nokey,
title = {The GeometricKernels Package: Heat and Matérn Kernels for Geometric Learning on Manifolds, Meshes, and Graphs},
author = {Peter Mostowsky and others},
url = {https://github.com/geometric-kernels/GeometricKernels},
abstract = {GeometricKernels is a library that implements kernels — most importantly, heat and Matérn kernels — on non-Euclidean spaces such as Riemannian manifolds, graphs and meshes. This enables kernel methods — in particular Gaussian process models — to be deployed on such spaces.},
keywords = {},
pubstate = {published},
tppubtype = {online}
}
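As a pointer for readers who want to try the package, its README documents a usage pattern along the following lines (a minimal sketch assuming the NumPy backend; the names Hypersphere, MaternGeometricKernel, init_params and K are taken from the linked repository and may change between versions):

import numpy as np
import geometric_kernels  # importing the package sets up the default NumPy backend
from geometric_kernels.spaces import Hypersphere
from geometric_kernels.kernels import MaternGeometricKernel

sphere = Hypersphere(dim=2)                 # the 2-sphere embedded in R^3
xs = np.array([[0.0, 0.0, 1.0],             # three unit vectors on the sphere
               [0.0, 1.0, 0.0],
               [1.0, 0.0, 0.0]])

kernel = MaternGeometricKernel(sphere)      # Matérn kernel adapted to the geometry
params = kernel.init_params()
params["nu"] = np.array([2.5])              # smoothness; per the docs, nu = inf recovers the heat kernel
params["lengthscale"] = np.array([1.0])

print(np.around(kernel.K(params, xs), 2))   # 3x3 kernel (covariance) matrix

The resulting kernel matrices can then be plugged into standard Gaussian-process tooling, which is the use case the abstract describes.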
Tomczyk, Bernard; Angelov, Plamen; Kangin, Dmitry
Machine Learning within Latent Spaces formed by Foundation Models Proceedings Article
In: 2024 IEEE 12th International Conference on Intelligent Systems (IS), pp. 1-10, IEEE, 0000, ISBN: 979-8-3503-5098-2.
@inproceedings{nokey,
title = {Machine Learning within Latent Spaces formed by Foundation Models},
author = {Bernard Tomczyk and Plamen Angelov and Dmitry Kangin},
url = {https://ieeexplore.ieee.org/abstract/document/10705264},
doi = {10.1109/IS61756.2024.10705264},
isbn = {979-8-3503-5098-2},
booktitle = {2024 IEEE 12th International Conference on Intelligent Systems (IS)},
pages = {1-10},
publisher = {IEEE},
abstract = {Foundation Models (FM) developed on very large generic data sets transformed the landscape of machine learning (ML). Vision transformers (ViT) closed the gap in performance between fine-tuned and unsupervised transfer learning. This opens the possibility to abandon the widely used until recently end-to-end approach. Instead, we consider a two-stage ML pipeline, where the first stage constitutes extracting features by pre-training large, multi-layer model with billions of parameters, and the second stage is a computationally lightweight learning of an entirely new, simpler model architecture based on prototypes within this feature space. In this paper we consider such two-stage approach to ML. We further analyse the use of several alternative light-weight methods in the second stage, including strategies for semi-supervised learning and a variety of strategies for linear fine-tuning. We demonstrate on the basis of nine well known benchmark data sets that the ultra-light-weight ML alternatives for the second stage (such as clustering, PCA, LDA and combinations of these) offer for the price of negligible drop in accuracy a significant (several orders of magnitude) drop of computational costs (time, energy and related CO2 emissions) as well as the ability to use no labels (fully unsupervised approach) or limited amount of labels (one per cluster labels) and the ability to address interpretability.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
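The two-stage pipeline described in the abstract above can be illustrated with a generic sketch (not the authors' implementation): a frozen foundation-model encoder provides the latent space, and a lightweight classical model is fitted inside it. The encoder checkpoint (DINOv2), the PCA dimensionality and the nearest-centroid (prototype) classifier below are illustrative choices:

import numpy as np
import torch
from transformers import AutoImageProcessor, AutoModel
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestCentroid

# Stage 1: frozen feature extractor.
processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
encoder = AutoModel.from_pretrained("facebook/dinov2-base").eval()

@torch.no_grad()
def embed(images):
    # images: list of PIL images; returns (n, hidden_size) CLS-token features.
    inputs = processor(images=images, return_tensors="pt")
    return encoder(**inputs).last_hidden_state[:, 0].numpy()

# Stage 2: ultra-lightweight model in the latent space (PCA + class prototypes),
# trainable in seconds on a CPU and usable with very few labels.
def fit_second_stage(features: np.ndarray, labels: np.ndarray):
    pca = PCA(n_components=64).fit(features)   # assumes at least 64 training samples
    prototypes = NearestCentroid().fit(pca.transform(features), labels)
    return pca, prototypes

def predict(pca, prototypes, features: np.ndarray) -> np.ndarray:
    return prototypes.predict(pca.transform(features))

Swapping the second stage for clustering or LDA, as the abstract mentions, changes only a few lines while leaving the expensive first stage untouched.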
