@comment{NOTE(review): the entry below has no author field, which phdthesis
requires; the name is not present in the exported record. Add it from the
repository page given in the url field.}

@phdthesis{2019:186619792,
  title     = {Fault-tolerance at the management level in many-core systems},
  school    = {Programa de P{\'o}s-Gradua{\c{c}}{\~a}o em Ci{\^e}ncia da Computa{\c{c}}{\~a}o},
  publisher = {Pontif{\'\i}cia Universidade Cat{\'o}lica do Rio Grande do Sul},
  year      = {2019},
  url       = {http://tede2.pucrs.br/tede2/handle/tede/8982},
  note      = {Escola Polit{\'e}cnica},
  abstract  = {The technology nodes reduction enabled the emergence of
    NoC-based many-cores with dozens to hundreds of processing elements
    (PEs). Despite the processing power offered by a large number of
    processors and communication flexibility due to the adoption of NoCs, it
    is necessary to manage the many-core resources to ensure scalability. The
    execution of the management tasks requires processing elements reserved
    exclusively to execute such actions. A centralized approach would induce
    a significant load to the managers PEs (MPE) in large-scale systems. The
    adoption of distributed approaches, with MPEs hierarchically organized,
    reduces the management load, being the organization adopted in this work.
    Recent proposals for Many-core System-on-chip (MCSoCs) management focus
    on different aspects: power, performance, system resources. These
    management techniques are applied to the systemic level of the MCSoCs.
    However, in the reviewed works, there is a gap in proposals related to
    permanent faults in processors with management functions. This Thesis
    aims to tackle two main problems. First, to treat permanent faults in
    management processors, developing a set of new techniques so that the
    MCSoCs continues to operate correctly, without re-executing applications
    running on it. Second, to solve the single point of failure issue
    regarding the communication of the MCSoCs with the external world. The
    original contribution of this Thesis is a distributed MCSoC architecture,
    with fault recovery capability at critical points in the system. The
    recovery method includes hardware and software modules, fault monitoring,
    and management recovering. The proposal uses task migration techniques,
    and heuristics to select the position of the new manager.
    This Thesis proposes a recovery method when an MPE became faulty. The
    method is scalable, able to act in systems from dozens up to hundreds of
    processors. The method is transparent to the applications executing in
    the MCSoC, with a small execution overhead observed during the management
    and task migration.},
}