% PhD thesis record exported from the PUCRS thesis repository (see the url field).
% NOTE(review): the required "author" field is missing from this record; add it
% from the repository page before citing, otherwise BibTeX/Biber will warn and
% the reference will render without an author.
% The original export misspelled "school" as "scholl" and stored the graduate
% program there; "school" now holds the degree-granting university and the
% program is kept in the (style-ignored) "department" field.
@phdthesis{2022:1939307104,
  title      = {A model for automatized data integration in {Hadoop}-based data lakes},
  school     = {Pontifícia Universidade Católica do Rio Grande do Sul},
  department = {Programa de Pós-Graduação em Ciência da Computação},
  publisher  = {Pontifícia Universidade Católica do Rio Grande do Sul},
  year       = {2022},
  url        = {https://tede2.pucrs.br/tede2/handle/tede/10250},
  note       = {Escola Politécnica},
  abstract   = {The massive amount of data currently generated by our computing systems and devices, known as big data, require specific technologies to be stored, processed, and distributed. Data lakes are architectures to store data of various formats to be queried when necessary, without needing a predefined schema. Data lakes aim to manage big data ecosystems, and most are currently created based on the Hadoop framework. A known challenge related to data lakes is integrating data from different formats. Data integration is a complex task that requires the attention of a specialist, besides being time-consuming and error-prone. However, this task can be facilitated if we use techniques to know the data profile. This thesis develops a model to automate the heterogeneous data integration process in Hadoop-based data lakes. In this sense, we design a method with five phases to help achieve the research objective: Foundation, Implementation, Experimentation, Evaluation, and Final Model. Our main contributions include the findings of three systematic literature reviews, where we deeply discuss themes related to data lakes, big data profiling, and data integration in data lakes, which served as a basis for the development of a model that enables the automatized integration of heterogeneous data in Hadoop-based data lakes, besides the experiments with bioinformatics data.},
}