@PHDTHESIS{ 2015:41464901, title = {Application-aware software-defined networking to accelerate mapreduce applications}, year = {2015}, url = "http://tede2.pucrs.br/tede2/handle/tede/5276", abstract = "The rise of Internet of Things sensors, social networking and mobile devices has led to an explosion of available data. Gaining insights into this data has led to the area of Big Data analytics. The MapReduce (MR) framework, as implemented in Hadoop, has become the de facto standard for Big Data analytics. It also forms a base platform for a plurality of Big Data technologies that are used today. To handle the ever-increasing data size, Hadoop is a scalable framework that allows dedicated, seemingly unbound numbers of servers to participate in the analytics process. Response time of an analytics request is an important factor for time to value/insights. While the compute and disk I/O requirements can be scaled with the number of servers, scaling the system leads to increased network traffic. Arguably, the communication-heavy phase of MR contributes significantly to the overall response time. This problem is further aggravated, if communication patterns are heavily skewed, as is not uncommon in many MR workloads. MR applications normally run in large data centers (DCs) employing dense network topologies (e.g. multi-rooted trees) with multiple paths available between any pair of hosts. These DC network designs, combined with recent software-defined network (SDN) programmability, offer a new opportunity to dynamically and intelligently configure the network to achieve shorter application runtime. The initial intuition motivating our work is that the well-defined structure of MR and the rich traffic demand information available in Hadoop s log and meta-data files could be used to guide the network control. We therefore conjecture that an application-aware network control (i.e., one that knows the applicationlevel semantics and traffic demands) can improve MR applications performance when compared to state-of-the-art application-agnostic network control. To confirm our thesis, we first studied MR systems in detail and identified typical communication patterns and common causes of network-related performance bottlenecks in MR applications. Then, we studied the state of the art in DC networks and evaluated its ability to handle MapReduce-like communication patterns. Our results confirmed the assumption that existing techniques are not able to deal with MR communication patterns mainly because of the lack of visibility of application-level information. Based on these findings, we proposed an architecture for an application-aware network control for DCs running MR applications. We implemented a prototype within a SDN controller and used it to successfully accelerate MR applications. Depending on the network oversubscription ratio, we demonstrated a 2% to 58% reduction in the job completion time for popular MR benchmarks, when compared to ECMP (the de facto flow allocation algorithm in multipath DC networks), thus, confirming the thesis. Other contributions include a method to predict network demands in MR applications, algorithms to identify the critical communication path in MR shuffle and dynamically alocate paths to flows in a multipath network, and an emulation-based testbed for realistic MR workloads.", publisher = {Pontifícia Universidade Católica do Rio Grande do Sul}, scholl = {Programa de Pós-Graduação em Ciência da Computação}, note = {Faculdade de Informáca} }