@MASTERSTHESIS{2016:1711815232,
  title     = {Design and exploration of 3D MPSoCs with on-chip cache support},
  year      = {2016},
  url       = {http://tede2.pucrs.br/tede2/handle/tede/6924},
  abstract  = {Advances in semiconductor manufacturing technology have made it possible to implement an entire computing system on a single chip, namely a System-on-Chip (SoC). SoCs integrate several processing elements (PEs), memory components and I/O devices. This work uses the term Multiprocessor System-on-Chip (MPSoC) for SoCs that integrate several cooperating PEs. The increasing number of PEs in an MPSoC demands architectures that provide scalability and concurrent communication. The Network-on-Chip (NoC), which interconnects the system through distributed routers, addresses these requirements. The interconnection system must also provide resources to fulfil the communication between PEs and memory modules. Unfortunately, previous works have shown that a single packet-based NoC is not well suited to provide scalability and low latency for cache-supported systems. Additionally, many NoC-based designs lack support for a shared-memory programming model, which is an essential requirement for most parallel applications. The main contribution of this work is the design and experimental exploration of 3D MPSoCs with on-chip cache support that employ a crossbar-based infrastructure for the cache-coherent memory hierarchy and a packet-based NoC for inter-processor communication, owing to its efficiency in transferring small packets and its benefits for ever-increasing scalability requirements. Experiments were performed on the gem5 simulator using the ARM ISA and the PARSEC and NASA NAS benchmarks, under three evaluation scenarios: 1. Main memory evaluation using emerging 3D memory technologies and two traditional desktop memories: Double Data Rate (DDR) and mobile Low Power (LP) DDR. For most of the applications, the emerging 3D memory technologies showed a runtime increase of 10% or less while providing significant energy savings compared with DDR memories; 2. Cache evaluation using five cache architectures, exploring their effects on execution runtime and energy consumption. Three shared L2 cache designs and two private L2 cache designs were explored. For the majority of the applications evaluated, the traditional shared L2 design had the lowest execution runtime, whereas the private L2 designs showed the lowest energy consumption; 3. Scalability evaluation of the proposed system, with experiments using various cluster sizes and applications based on message exchange.},
  publisher = {Pontifícia Universidade Católica do Rio Grande do Sul},
  school    = {Programa de Pós-Graduação em Ciência da Computação},
  note      = {Faculdade de Informática}
}