% Bibliography of papers by Alina Sirbu, Ozalp Babaoglu and co-authors
% (2014--2016), ordered newest year first.
%
% Conventions used in this file:
%   * Citation keys are numeric and are kept exactly as they were; do not
%     renumber them, since documents citing these entries refer to the keys.
%   * Names are stored as "Last, First" and separated by " and ".
%   * doi holds the bare DOI (no "DOI:" or resolver prefix).
%   * month uses the predefined three-letter macros (unquoted).
%   * LNCS volumes are recorded as series + numeric volume.
%   * Abstracts and URLs are reproduced verbatim from the original records.

% NOTE(review): this record was typed as whole proceedings although it is a
% single paper with authors. The booktitle was reconstructed from the DOI
% prefix and the LNCS volume number -- confirm against the publisher record.
@inproceedings{806,
  author    = {Sirbu, Alina and Babaoglu, Ozalp},
  title     = {Power Consumption Modeling and Prediction in a Hybrid {CPU-GPU-MIC} Supercomputer},
  booktitle = {Euro-Par 2016: Parallel Processing},
  series    = {Lecture Notes in Computer Science},
  volume    = {9833},
  year      = {2016},
  publisher = {Springer},
  address   = {Grenoble, France},
  abstract  = {Power consumption is a major obstacle for High Performance Computing (HPC) systems in their quest towards the holy grail of ExaFLOP performance. Significant advances in power efficiency have to be made before this goal can be attained and accurate modeling is an essential step towards power efficiency by optimizing system operating parameters to match dynamic energy needs. In this paper we present a study of power consumption by jobs in Eurora, a hybrid CPU-GPU-MIC system installed at the largest Italian data center. Using data from a dedicated monitoring framework, we build a data-driven model of power consumption for each user in the system and use it to predict the power requirements of future jobs. We are able to achieve good prediction results for over 80 \% of the users in the system. For the remaining users, we identify possible reasons why prediction performance is not as good. Possible applications for our predictive modeling results include scheduling optimization, power-aware billing and system-scale power modeling. All the scripts used for the study have been made available on GitHub.},
  doi       = {10.1007/978-3-319-43659-3_9},
  url       = {http://arxiv.org/abs/1601.05961},
}

% NOTE(review): the "\&" in the conference name was missing in the original
% record and has been restored -- confirm against the IEEE Xplore record.
@inproceedings{864,
  author    = {Sirbu, Alina and Babaoglu, Ozalp},
  title     = {Predicting System-level Power for a Hybrid Supercomputer},
  booktitle = {2016 International Conference on High Performance Computing \& Simulation ({HPCS})},
  year      = {2016},
  month     = jul,
  publisher = {IEEE},
  address   = {Innsbruck, Austria},
  abstract  = {For current High Performance Computing systems to scale towards the holy grail of ExaFLOP performance, their power consumption has to be reduced by at least one order of magnitude. This goal can be achieved only through a combination of hardware and software advances. Being able to model and accurately predict the power consumption of large computational systems is necessary for software-level innovations such as proactive and power-aware scheduling, resource allocation and fault tolerance techniques. In this paper we present a 2-layer model of power consumption for a hybrid supercomputer (which held the top spot of the Green500 list on July 2013) that combines CPU, GPU and MIC technologies to achieve higher energy efficiency. Our model takes as input workload information - the number and location of resources that are used by each job at a certain time - and calculates the resulting system-level power consumption. When jobs are submitted to the system, the workload configuration can be foreseen based on the scheduler policies, and our model can then be applied to predict the ensuing system-level power consumption. Additionally, alternative workload configurations can be evaluated from a power perspective and more efficient ones can be selected. Applications of the model include not only power-aware scheduling but also prediction of anomalous behavior.},
  doi       = {10.1109/HPCSim.2016.7568420},
  url       = {http://ieeexplore.ieee.org/document/7568420/},
}

@article{846,
  author   = {Sirbu, Alina and Babaoglu, Ozalp},
  title    = {Towards operator-less data centers through data-driven, predictive, proactive autonomics},
  journal  = {Cluster Computing},
  year     = {2016},
  month    = apr,
  pages    = {1--14},
  abstract = {Continued reliance on human operators for managing data centers is a major impediment for them from ever reaching extreme dimensions. Large computer systems in general, and data centers in particular, will ultimately be managed using predictive computational and executable models obtained through data-science tools, and at that point, the intervention of humans will be limited to setting high-level goals and policies rather than performing low-level operations. Data-driven autonomics, where management and control are based on holistic predictive models that are built and updated using live data, opens one possible path towards limiting the role of operators in data centers. In this paper, we present a data-science study of a public Google dataset collected in a 12K-node cluster with the goal of building and evaluating predictive models for node failures. Our results support the practicality of a data-driven approach by showing the effectiveness of predictive models based on data found in typical data center logs. We use BigQuery, the big data SQL platform from the Google Cloud suite, to process massive amounts of data and generate a rich feature set characterizing node state over time. We describe how an ensemble classifier can be built out of many Random Forest classifiers each trained on these features, to predict if nodes will fail in a future 24-h window. Our evaluation reveals that if we limit false positive rates to 5 \%, we can achieve true positive rates between 27 and 88 \% with precision varying between 50 and 72 \%. This level of performance allows us to recover large fraction of jobs{\textquoteright} executions (by redirecting them to other nodes when a failure of the present node is predicted) that would otherwise have been wasted due to failures. We discuss the feasibility of including our predictive model as the central component of a data-driven autonomic manager and operating it on-line with live data streams (rather than off-line on data logs). All of the scripts used for BigQuery and classification analyses are publicly available on GitHub.},
  doi      = {10.1007/s10586-016-0564-y},
  url      = {http://link.springer.com/article/10.1007/s10586-016-0564-y},
}

@article{801,
  author  = {Balliu, Alkida and Olivetti, Dennis and Babaoglu, Ozalp and Marzolla, Moreno and Sirbu, Alina},
  title   = {A Big Data Analyzer for Large Trace Logs},
  journal = {Computing},
  year    = {2015},
  doi     = {10.1007/s00607-015-0480-7},
  url     = {http://link.springer.com/article/10.1007/s00607-015-0480-7},
}

@inproceedings{802,
  author    = {Sirbu, Alina and Babaoglu, Ozalp},
  title     = {A Holistic Approach to Log Data Analysis in High-Performance Computing Systems: The Case of {IBM} {Blue Gene/Q}},
  booktitle = {Euro-Par 2015: Parallel Processing Workshops},
  series    = {Lecture Notes in Computer Science},
  volume    = {9523},
  year      = {2015},
  publisher = {Springer},
  doi       = {10.1007/978-3-319-27308-2_51},
  url       = {http://link.springer.com/chapter/10.1007\%2F978-3-319-27308-2_51},
}

@inproceedings{803,
  author    = {Sirbu, Alina and Babaoglu, Ozalp},
  title     = {Towards Data-Driven Autonomics in Data Centers},
  booktitle = {IEEE International Conference on Cloud and Autonomic Computing},
  year      = {2015},
  publisher = {IEEE},
  doi       = {10.1109/ICCAC.2015.19},
  url       = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=7312140\&filter\%3DAND\%28p_IS_Number\%3A7312127\%29},
}

@inproceedings{807,
  author    = {Balliu, Alkida and Olivetti, Dennis and Babaoglu, Ozalp and Marzolla, Moreno and Sirbu, Alina},
  title     = {{BiDAl}: Big Data Analyzer for Cluster Traces},
  booktitle = {Informatika (BigSys workshop)},
  year      = {2014},
  publisher = {GI-Edition Lecture Notes in Informatics},
  url       = {http://arxiv.org/abs/1410.1309},
}