@mastersthesis{2020:1896387931,
  title         = {High-Level Programming Abstractions for Distributed Stream Processing},
  year          = {2020},
  url           = {http://tede2.pucrs.br/tede2/handle/tede/9916},
  abstract      = {Stream processing applications represent a significant part of today’s software. An increased amount of streaming data is generated every day from various sources (computing devices and applications), which requires to be processed on time. Shared-memory architectures cannot cope with these large-scale processing demands. In High-Performance Computing (HPC), Message Passing Interface (MPI) is the state-of-the-art parallel API (Application Programming Interface) for implementing parallel C/C++ programs. However, the stream parallelism exploitation using MPI is difficult and error-prone to application developers because it exposes low-level details to them, regarding computer architectures and operating systems. Programmers have to deal with implementation mechanisms for data serialization, process communication and synchronization, fault tolerance, work scheduling, load balancing, and parallelism strategies. Our research work addresses a subset of these challenges and problems providing two high-level programming abstractions for distributed stream processing. First, we created a distributed stream parallelism library called DSPARLIB. It was built as a skeleton library equipped with Farm and Pipeline parallel patterns to provide programming abstractions on top of MPI. Second, we extended the SPAR language and compiler roles to support distributed memory architectures since it is a Domain-Specific Language (DSL) for expressing stream parallelism using C++11 annotation that has been proved to be productive on shared-memory architectures. We managed to make it work without significantly changing the ease of use language syntax and semantics, generating automatic parallel code with SPAR’s compiler using DSPARLIB as the parallel runtime. The experiments were conducted using real-world stream processing applications and testing different cluster configurations. We demonstrated that DSPARLIB provides a simpler API than MPI and a competitive performance. Also, the SPAR’s compiler was able to generate parallel code automatically without performance penalties compared to handwritten codes in DSPARLIB. Finally, with all these high-level programming abstractions implemented, SPAR becomes the first annotation-based language for expressing stream parallelism in C++ programs to support distributed-memory architectures, avoiding significant sequential code refactoring to enable parallel execution on clusters.},
  school        = {Pontifícia Universidade Católica do Rio Grande do Sul},
  note          = {Programa de Pós-Graduação em Ciência da Computação, Escola Politécnica},
  internal-note = {Required author field is missing for @mastersthesis -- look up the author on the repository page in the url field and add it},
}