% Conference paper: the "defoe" Spark-based text-mining toolbox (IEEE eScience 2019).
% The citation key is the exporter-generated identifier and is kept as-is so
% existing citations keep resolving.
% The `funding` and `copyright` fields are not known to standard styles and are
% therefore ignored at typesetting time; they only preserve exporter metadata.
% NOTE(review): `address` holds a country rather than the publisher's city --
% confirm and replace with the city if known.
@inproceedings{03ec12360673451e9e0429b9f93836a2,
  author    = {Filgueira, Rosa and {Coll Ardanuy}, Mariona and Colavizza, Giovanni and Hetherington, James and Terras, Melissa and Jackson, Michael and Roubickova, Anna and Krause, Amrey and Ahnert, Ruth and Hauswedell, Tessa and Nyhan, Julianne and Beavan, David and Hobson, Timothy},
  title     = {{Defoe}: A {Spark}-Based Toolbox for Analysing Digital Historical Textual Data},
  booktitle = {Proceedings - {IEEE} 15th International Conference on {eScience}, {eScience} 2019},
  pages     = {235--242},
  year      = {2019},
  month     = sep,
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  doi       = {10.1109/eScience.2019.00033},
  language  = {English},
  keywords  = {Apache Spark, Digital tools, Distributed queries, High-Performance Computing, Historical sources, Humanities research, Text mining, XML schemas},
  abstract  = {This work presents defoe, a new scalable and portable digital eScience toolbox that enables historical research. It allows for running text mining queries across large datasets, such as historical newspapers and books in parallel via Apache Spark. It handles queries against collections that comprise several XML schemas and physical representations. The proposed tool has been successfully evaluated using five different large-scale historical text datasets and two HPC environments, as well as on desktops. Results show that defoe allows researchers to query multiple datasets in parallel from a single command-line interface and in a consistent way, without any HPC environment-specific requirements.},
  note      = {15th {IEEE} International Conference on {eScience}, {eScience} 2019; Conference date: 24-09-2019 through 27-09-2019},
  funding   = {This work was funded by Scottish Enterprise as part of the Alan Turing Institute-Scottish Enterprise Data Engineering Programme, and by AHRC as part of the Living with Machines via the Strategic Priorities Fund.},
  copyright = {{\textcopyright} 2019 IEEE.},
}