diff --git a/.wordlist.txt b/.wordlist.txt
index bfbe6e5d9..067b70e4a 100644
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -36,6 +36,7 @@ DockerHub
 dP
 dt
 Elgg
+entrypoint
 epfl
 ethernet
 faban
@@ -43,8 +44,10 @@ FastCGI
 fb
 filesystem
 fpm
+frontend
 Gbit
 GBs
+GC
 github
 GraphX
 grouplens
@@ -61,6 +64,7 @@ IPAddress
 JVM
 JIT
 keyspace
+LLC
 localhost
 login
 MapReduce
@@ -74,6 +78,7 @@ MemcacheServer
 memcacheserverdocker
 metadata
 middleware
+microarchitectures
 MLlib
 Moby
 Movielens
@@ -101,11 +106,13 @@ qemu
 QEMU
 QoS
 README
+realtime
 Recommender
 RECORDCOUNT
 repo
 runtime
 scalability
+Skylake
 slavedocker
 solr
 solr's
@@ -121,9 +128,11 @@ UI
 usertable
 usr
 uwsgi
+vectorization
 videoperf
 VM
 VMs
+warmup
 WebServer
 webserverdocker
 webserverdocker
diff --git a/LICENSE.md b/LICENSE.md
index c592340b8..1c3e185f1 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -3,7 +3,6 @@ CloudSuite consists of several software components that are governed by various
 ### Software developed externally (not by the CloudSuite group)
 * [Nginx Web Server](http://nginx.org/LICENSE)
-* [MySQL DBMS](http://www.gnu.org/licenses/gpl.html)
 * [PHP](http://www.php.net/license/3_01.txt)
 * [APC (Alternative PHP Cache)](http://www.php.net/license/3_01.txt)
 * [Nutch](http://www.apache.org/licenses/LICENSE-2.0)
@@ -24,12 +23,12 @@ CloudSuite consists of several software components that are governed by various
 * [Elgg](https://www.gnu.org/licenses/gpl-2.0.html)
 
 ### Software developed internally (by the CloudSuite group)
-**CloudSuite 3.0 License**
+**CloudSuite 4.0 License**
 
-CloudSuite 3.0 Benchmark Suite
+CloudSuite 4.0 Benchmark Suite
 
-Copyright © 2011-2018, Parallel Systems Architecture Lab, EPFL
+Copyright © 2011-2023, Parallel Systems Architecture Lab, EPFL
 
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
diff --git a/benchmarks/data-caching/client/README.md b/benchmarks/data-caching/client/README.md
deleted file mode 100644
index c5db64812..000000000
--- a/benchmarks/data-caching/client/README.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# Memcached Client #
-
-This `Dockerfile` creates a docker image representing the Memcached client which tries to access server's data.
-
-Example:
-
-    $ docker pull cloudsuite/data-caching:client
-    $ docker run -it --name dc-client --net host cloudsuite/data-caching:client bash
diff --git a/benchmarks/data-caching/server/README.md b/benchmarks/data-caching/server/README.md
deleted file mode 100644
index da7300b2c..000000000
--- a/benchmarks/data-caching/server/README.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# Memcached Server #
-
-This `Dockerfile` creates a docker image containing the latest version of Memcached (1.6.10).
-Memcached will be started as a daemon with the passed parameters.
-
-Example:
-
-    $ docker pull cloudsuite/data-caching:server
-    $ docker run --name dc-server --net host -d cloudsuite/data-caching:server -t 4 -m 4096 -n 550
diff --git a/benchmarks/web-search/client/docker-entrypoint.py b/benchmarks/web-search/client/docker-entrypoint.py
index 9f3b1756a..b4f408815 100755
--- a/benchmarks/web-search/client/docker-entrypoint.py
+++ b/benchmarks/web-search/client/docker-entrypoint.py
@@ -12,7 +12,7 @@
 arg.add_argument("--interval-max", type=int, help="The maximum interval for request generation, in milliseconds", default=1500)
 arg.add_argument("--interval-deviation", type=float, help="The deviation of the interval, in percentage.", default=0)
 arg.add_argument("--interval-type", choices=["ThinkTime", "CycleTime"], help="The interval type.", default="ThinkTime")
-arg.add_argument("--interval-distribution", choices=["Fixed", "Uniform", "NegativeExponential"], help="The distribution of interval", default="Fixed")
+arg.add_argument("--interval-distribution", choices=["Fixed", "Uniform", "NegExp"], help="The distribution of the interval", default="Fixed")
 arg.add_argument("--dataset-distribution", choices=["Random", "Zipfian"], help="The distribution of the request", default="Zipfian")
 arg.add_argument("--output-query-result", "-q", action="store_true", help="Whether let Faban output search query. Can be a potential performance bottleneck.")
@@ -51,7 +51,7 @@
             arg.interval_deviation
         ))
         if arg.interval_min != arg.interval_max:
-            print("Warning: the maximal interval should be same as the minimal interval when fixed distribution is used. The program uses minimal interval as the fixed interval.")
+            print("Warning: the maximum interval should be the same as the minimum interval when the fixed distribution is used. The program uses the minimum interval as the fixed interval.")
     elif arg.interval_distribution == "Uniform":
         f.write("@Uniform(cycleMin = {}, cycleMax = {}, cycleType = CycleType.{}, cycleDeviation = {})\n".format(
             arg.interval_min,
@@ -59,7 +59,7 @@
             arg.interval_type.upper(),
             arg.interval_deviation
         ))
-    elif arg.interval_distribution == "NegativeExponential":
+    elif arg.interval_distribution == "NegExp":
         f.write("@NegativeExponential(cycleMin = {}, cycleMax = {}, cycleMean = {}, cycleType = CycleType.{}, cycleDeviation = {})\n".format(
             arg.interval_min,
             arg.interval_max,
diff --git a/docs/benchmarks/data-analytics.md b/docs/benchmarks/data-analytics.md
index d5778ef6a..eb75897b9 100644
--- a/docs/benchmarks/data-analytics.md
+++ b/docs/benchmarks/data-analytics.md
@@ -3,22 +3,19 @@
 [![Pulls on DockerHub][dhpulls]][dhrepo]
 [![Stars on DockerHub][dhstars]][dhrepo]
 
-The explosion of accessible human-generated information necessitates automated analytical processing to cluster, classify, and filter this information. Hadoop has emerged as a popular approach to handling large-scale analysis with its distributed file system and compute capabilities, allowing it to scale to PetaBytes of data. The Data Analytics benchmark is included in CloudSuite to cover the increasing importance of classification tasks analyzing large amounts of data in datacenters using the MapReduce framework. It is composed of Mahout, a set of machine learning libraries, running on top of Hadoop, an open-source implementation of MapReduce.
+The explosion of human-generated information necessitates automated analytical processing to cluster, classify, and filter this information. The Data Analytics benchmark is included in CloudSuite to cover the increasing importance of classification tasks in analyzing large amounts of data in datacenters. It uses the MapReduce framework Hadoop, which is a popular approach for handling large-scale analysis. Its distributed file system and compute capabilities allow it to scale to petabytes of data.
 
-The benchmark consists of running a Naive Bayes classifier on a (Wikimedia dataset)[https://dumps.wikimedia.org/backup-index.html]. It uses Hadoop version 2.10.2 and Mahout version 14.1.
+This workload is based on Mahout, a set of machine learning libraries running on top of Hadoop. It runs a Naive Bayes classifier on a [Wikimedia dataset](https://dumps.wikimedia.org/backup-index.html), and uses Hadoop version 2.10.2 and Mahout version 14.1.
 
-## Images ##
-To obtain the images:
+## Dockerfiles
 
-```bash
-$ docker pull cloudsuite/data-analytics
-$ docker pull cloudsuite/wikimedia-pages-dataset
-```
+Supported tags and their respective `Dockerfile` links:
+- [`latest`][latestcontainer] contains the application logic.
 
 ## Running the benchmark ##
 
-The benchmark is designed to run on a Hadoop cluster, where the single master runs the driver program, and the slaves run the mappers and reducers.
+The benchmark is designed to run on a Hadoop cluster, where a single master runs the driver program, and workers run the mappers and reducers.
 
 First, start the container for the dataset:
 
 ```bash
@@ -26,18 +23,17 @@
 $ docker create --name wikimedia-dataset cloudsuite/wikimedia-pages-dataset
 ```
 
-**Note**: The following commands will start the master for the cluster. To make sure that slaves and master can communicate with each other, the slave container's must point to the master's IP address.
-
 Start the master with:
 
 ```bash
 $ docker run -d --net host --volumes-from wikimedia-dataset --name data-master cloudsuite/data-analytics --master
 ```
 
-By default, Hadoop master node is listened on the first interface accessing to network . You can overwrite the listening address by adding `--master-ip=X.X.X.X` to change the setting.
+By default, the Hadoop master node listens on the first available network interface. You can override the listening address by adding `--master-ip=X.X.X.X`.
 
-Start any number of Hadoop slaves with:
-```
+Start any number of Hadoop workers with:
+
+```bash
 $ # on VM1
 $ docker run -d --net host --name data-slave01 cloudsuite/data-analytics --slave --master-ip=<IP_ADDRESS_MASTER>
 
 $ # on VM2
 $ docker run -d --net host --name data-slave02 cloudsuite/data-analytics --slave --master-ip=<IP_ADDRESS_MASTER>
 ...
 ```
 
-**Note**: You should set `IP_ADDRESS_MASTER` to master's IP address.
-After both master and slave are set up (you can use `docker logs` to observe if the log is still generating), run the benchmark with:
+**Note**: You should set `IP_ADDRESS_MASTER` to the master's IP address and make sure that address is accessible from each worker.
+
+After both the master and the workers are set up (you can use `docker logs` to check whether the log is still being updated), run the benchmark with the following command:
 
 ```bash
 $ docker exec data-master benchmark
 ```
 
 ### Configuring Hadoop parameters ###
 
-We can configure a few parameters for Hadoop depending on requirements.
+A few parameters for Hadoop can be configured depending on requirements.
-Hadoop infers the number of workers with how many partitions it created with HDFS. We can increase or reduce the HDFS partition size to `N` mb with `--hdfs-block-size=N`, 128mb being the default. The current dataset included here weights 900MB, thus the default `--hdfs-block-size=128` of 128mb resulting in splits between 1 and 8 parts depending on the benchmark phase.
+Hadoop infers the number of workers from how many partitions it created with HDFS (Hadoop Distributed File System, which distributes dataset chunks among workers). You can increase or reduce the HDFS partition size to `N` MB with `--hdfs-block-size=N`, with 128MB being the default. The default dataset weighs 900MB. Thus, depending on the benchmark phase (sequencing, vectorization, pre-training, training, and inference), the default option `--hdfs-block-size=128` results in between 1 and 8 splits.
 
-The maximum number of workers is configured by `--yarn-cores=C`, default is 8, if there's more splits than number of workers, YARN will only allow up to `C` workers threads to process them and multiplex the tasks. Please note that **at least 2 cores** should be given for all workers in total: One core for the map operation and another core for the reduce operation. Otherwise, the process can get stuck.
+Hadoop relies on [YARN][yarn] (Yet Another Resource Negotiator) to manage its resources, and the maximum number of workers is configured by `--yarn-cores=C`, whose default value is 8. If there are more blocks than workers, YARN will only allow up to `C` worker threads to process them. Please note that **at least two cores** should be given in total: one core for the map operation and another for the reduce operation. Otherwise, the process can get stuck.
 
-The maximum memory used by each worker is configured by `--mapreduce-mem=N`, default is 2096mb. Note that depending on the number of `--yarn-cores=C`, the total actual physical memory required will be of at least `C*N`. You are recommended to allocate 8GB memory (even for single worker with 2 CPUs) in total to avoid out of memory errors.
+The maximum memory used by each worker is configured by `--mapreduce-mem=N`, and the default value is 2096MB. Note that depending on the number of `--yarn-cores=C`, the total physical memory required will be at least `C*N`. To avoid out-of-memory errors, we recommend allocating at least 8GB of memory (even for a single worker with two cores) in total.
 
-For increasing total number of workers, please use a bigger dataset from wikimedia. Using a smaller partition sizes than 128 mb will result in increasing number of workers but also will actually slowdown the execution due to overheads of small partition size.
+To increase the number of workers, please use a bigger dataset from Wikimedia. Using partition sizes smaller than 128MB can increase the number of workers, but it also slows down the execution due to the overheads of small partitions.
 
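+As a concrete sketch, the three knobs above can be combined on the master's command line. This assumes the flags are accepted alongside `--master`, as the option descriptions above suggest; the values are illustrative rather than tuned recommendations:
+
+```bash
+# Illustrative: 128MB HDFS blocks, 4 YARN cores, and 2096MB per worker,
+# so this node needs at least 4 * 2096MB (roughly 8GB) of physical memory.
+$ docker run -d --net host --volumes-from wikimedia-dataset --name data-master \
+    cloudsuite/data-analytics --master --hdfs-block-size=128 --yarn-cores=4 --mapreduce-mem=2096
+```
+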
 [dhrepo]: https://hub.docker.com/r/cloudsuite/data-analytics/ "DockerHub Page"
 [dhpulls]: https://img.shields.io/docker/pulls/cloudsuite/data-analytics.svg "Go to DockerHub Page"
 [dhstars]: https://img.shields.io/docker/stars/cloudsuite/data-analytics.svg "Go to DockerHub Page"
+[yarn]: https://hadoop.apache.org/docs/stable/hadoop-yarn/hadoop-yarn-site/YARN.html "YARN explanation"
+
+[latestcontainer]: https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/data-analytics/latest/Dockerfile "link to container, github"
\ No newline at end of file
diff --git a/docs/benchmarks/data-caching.md b/docs/benchmarks/data-caching.md
index 3f32edd59..96e11b78c 100644
--- a/docs/benchmarks/data-caching.md
+++ b/docs/benchmarks/data-caching.md
@@ -3,124 +3,115 @@
 [![Pulls on DockerHub][dhpulls]][dhrepo]
 [![Stars on DockerHub][dhstars]][dhrepo]
 
-This benchmark uses the [Memcached][memcachedWeb] data caching server,
-simulating the behavior of a Twitter caching server using a twitter dataset.
-The metric of interest is throughput expressed as the number of requests served per second.
-The workload assumes strict quality of service guarantees.
+This benchmark uses the [Memcached][memcachedWeb] data caching server. It simulates the behavior of a Twitter data caching server using a Twitter dataset. The metric of interest is throughput, expressed as the number of requests served per second. The workload assumes a strict Quality of Service (QoS) guarantee: the 99th percentile latency should be less than 1ms.
 
 ## Using the benchmark ##
 
-This benchmark features two tiers: the server(s), running Memcached, and the client(s), which request data cached on the Memcached servers. Each tier has its own image which is identified by its tag.
+This benchmark features two tiers: the server(s) running Memcached and the client(s) requesting data cached on the Memcached servers. Each tier has its own image, identified by its tag.
 
 ### Dockerfiles ###
 
 Supported tags and their respective `Dockerfile` links:
 
-  - [`server`][serverdocker]: This represents the Memcached server running as a daemon.
-  - [`client`][clientdocker]: This represents the client which requests to access the server's data.
-
-These images are automatically built using the mentioned Dockerfiles available on `parsa-epfl/cloudsuite` [GitHub repo][repo].
+  - [`server`][serverdocker] represents the Memcached server running as a daemon.
+  - [`client`][clientdocker] represents the client which requests access to the server's data.
 
 ### Starting the Server ####
 
-To start the server you have to first `pull` the server image and then run it. To `pull` the server image use the following command:
-
-    $ docker pull cloudsuite/data-caching:server
+The following command will start a single Memcached server with four threads and 10GB of dedicated memory, with a minimum object size of 550 bytes, listening on the default port 11211:
 
-It takes some time to download the image, but this is only required the first time.
-The following command will start the server with four threads and 10GB of dedicated memory, with a minimal object size of 550 bytes listening on port 11211 as default:
+```bash
+$ docker run --name dc-server --net host -d cloudsuite/data-caching:server -t 4 -m 10240 -n 550
+```
 
-    $ docker run --name dc-server --net host -d cloudsuite/data-caching:server -t 4 -m 10240 -n 550
+You may also set up multiple Memcached server instances using the following commands:
 
-  The following commands create Memcached server instances:
+```bash
+# on VM1
+$ docker run --name dc-server1 --net host -d cloudsuite/data-caching:server -t 4 -m 10240 -n 550
 
-    $ # on VM1
-    $ docker run --name dc-server1 --net host -d cloudsuite/data-caching:server -t 4 -m 10240 -n 550
+# on VM2
+$ docker run --name dc-server2 --net host -d cloudsuite/data-caching:server -t 4 -m 10240 -n 550
 
-    $ # on VM2
-    $ docker run --name dc-server2 --net host -d cloudsuite/data-caching:server -t 4 -m 10240 -n 550
-    ...
-
+# ...
+```
 
 ### Starting the Client ####
 
-To start the client you have to first `pull` the client image and then run it. To `pull` the server image use the following command:
-
-    $ docker pull cloudsuite/data-caching:client
-
-It takes some time to download the image, but this is only required the first time.
-
-Create an empty folder and then create the server configuration file, named `docker_servers.txt` inside the folder. This file includes the server address and the port number to connect to, in the following format:
-
+Create an empty folder and then create a server configuration file named `docker_servers.txt` inside the folder. This file includes the server address and the port number to connect to, in the following format:
+```
 server_address, port
-
-The client can simultaneously use multiple servers, one server with several ip addresses (in case the server machine has multiple ethernet cards active), and one server through multiple ports, measuring the overall throughput and quality of service. In that case, each line in the configuration file should contain the server address and the port number. To illustrate, in the case of our example it should be:
-
+```
+The client can simultaneously use multiple servers, one server with several IP addresses (in case the server machine has multiple ethernet cards active), or one server through multiple ports, while measuring the overall throughput and QoS. In each case, every line in the configuration file should contain the corresponding server address and port number. To illustrate, in the case of our example, it would be:
+```
 IP_ADDRESS_VM1, 11211
 IP_ADDRESS_VM2, 11211
 ...
+```
+To start the client container, use the following command:
 
-To start the client container use the following command:
-
-    $ docker run -idt --name dc-client --net host -v PATH_TO_DOCKER_SERVERS_FOLDER:/usr/src/memcached/memcached_client/docker_servers/ cloudsuite/data-caching:client
+```bash
+$ docker run -idt --name dc-client --net host -v PATH_TO_DOCKER_SERVERS_FOLDER:/usr/src/memcached/memcached_client/docker_servers/ cloudsuite/data-caching:client
+```
 
-Please note that the command mounts the folder containing the 'docker_servers.txt' file instead of mounting only the file. This way, further changes to the docker_servers.txt file in the host will be reflected inside of the container.
+Please note that the command mounts the folder containing the `docker_servers.txt` file instead of only the file. This way, further changes to `docker_servers.txt` on the host will be reflected inside the container.
 
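+For instance, to drive one server machine through two ports plus a second machine (the addresses and the extra port below are placeholders for this illustration), `docker_servers.txt` would read:
+
+```
+IP_ADDRESS_VM1, 11211
+IP_ADDRESS_VM1, 11212
+IP_ADDRESS_VM2, 11211
+```
+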
 #### Scaling the dataset and warming up the server ####
 
-The following command will create the dataset by scaling up the Twitter dataset, while preserving both the popularity and object size distributions. The original dataset consumes ~360MB of server memory, while the recommended scaled dataset requires around 10GB of main memory dedicated to the Memcached server. Therefore, we use a scaling factor of 28 to have a 10GB dataset.
+The following command will create the dataset by scaling up the Twitter dataset while preserving both the popularity and object size distributions. The original dataset consumes ~360MB of server memory, while the recommended scaled dataset requires around 10GB of main memory dedicated to the Memcached server. Therefore, we use a scaling factor of 28 to have a 10GB dataset.
 
-    $ docker exec -it dc-client /bin/bash /entrypoint.sh --m="S&W" --S=28 --D=10240 --w=8 --T=1
-
-(`m` - the mode of operation, `S&W` means scale the dataset and warm up the server, `w` - number of client threads which has to be divisible to the number of servers, `S` - scaling factor, `D` - target server memory, `T` - statistics interval).
+```bash
+$ docker exec -it dc-client /bin/bash /entrypoint.sh --m="S&W" --S=28 --D=10240 --w=8 --T=1
+```
 
-If the scaled file is already created, but the server is not warmed up, use the following command to warm up the server. `W` refers to the _warm up_ mode of operation.
+(`m` - the mode of operation, `S&W` means scale the dataset and warm up the server, `w` - the number of client threads, which has to be divisible by the number of servers, `S` - scaling factor, `D` - target server memory, `T` - statistics interval).
 
-    $ docker exec -it dc-client /bin/bash /entrypoint.sh --m="W" --S=28 --D=10240 --w=8 --T=1
+If the scaled file already exists, but the server is not warmed up, use the following command to warm up the server. `W` refers to the _warm-up_ mode of operation.
 
+```bash
+$ docker exec -it dc-client /bin/bash /entrypoint.sh --m="W" --S=28 --D=10240 --w=8 --T=1
+```
 
 ### Running the benchmark ###
 
-To determine the maximum throughput while running the workload with eight client threads,
+To determine the maximum throughput while running the workload with 8 client threads,
 200 TCP/IP connections, and a get/set ratio of 0.8, use the following command. `TH` refers
 to the _throughput_ mode of operation.
 
-    $ docker exec -it dc-client /bin/bash /entrypoint.sh --m="TH" --S=28 --g=0.8 --c=200 --w=8 --T=1
+```bash
+$ docker exec -it dc-client /bin/bash /entrypoint.sh --m="TH" --S=28 --g=0.8 --c=200 --w=8 --T=1
+```
 
-This command will run the benchmark with the maximum throughput; however, the QoS requirements will highly likely be violated. Once the maximum throughput is determined, you should run the benchmark using the following command. `RPS` means that the client container will keep the load at the given load (requests per second).
+This command will run the benchmark with maximum throughput; however, the QoS requirements will likely be violated. Once the maximum throughput is determined, run the benchmark using the following command. `RPS` means the target load, in requests per second, supplied by the client container.
 
-    $ docker exec -it dc-client /bin/bash /entrypoint.sh --m="RPS" --S=28 --g=0.8 --c=200 --w=8 --T=1 --r=rps
+```bash
+$ docker exec -it dc-client /bin/bash /entrypoint.sh --m="RPS" --S=28 --g=0.8 --c=200 --w=8 --T=1 --r=rps
+```
 
-where `rps` is 90% of the maximum number of requests per second achieved using the previous command.
-You should experiment with different values of `rps` to achieve the maximum throughput without violating the target QoS requirements. By default, the request interval is fixed. You can add `--ne` to make the interval follow negative exponential distribution.
+Here, `rps` can start at 90% of the maximum number of requests per second achieved with the previous command. It would be best to experiment with different `rps` values to achieve the maximum throughput without violating the target QoS requirements. By default, the request interval is fixed. You can add the `--ne` flag to make the interval follow a negative exponential distribution.
 
-Note that the last two commands will continue forever if you do not stop or kill the command. For running the command for a given amount of time, you can use the timeout command. The following example will run the benchmark in the `RPS` mode for 20 seconds:
+Note that the last two commands will continue forever if you do not stop or kill the command. You can use the `timeout` command to run a command for a given amount of time. The following example will run the benchmark in `RPS` mode for 20 seconds:
 
-    $ docker exec -it dc-client timeout 20 /bin/bash /entrypoint.sh --m="RPS" --S=28 --g=0.8 --c=200 --w=8 --T=1 --r=100000
+```bash
+$ docker exec -it dc-client timeout 20 /bin/bash /entrypoint.sh --m="RPS" --S=28 --g=0.8 --c=200 --w=8 --T=1 --r=100000
+```
 
 ## Important remarks ##
 
-- It takes several minutes for the server to reach a stable state.
-
 - The target QoS requires that 99% of the requests are serviced within 1ms.
 
-- Memcached has known scalability problems, scaling very poorly beyond four threads. To utilize a machine with more than four cores, you should start several server processes and add the corresponding parameters into the client configuration file.
+- Memcached has known scalability problems, scaling very poorly beyond four threads. To utilize a machine with more than four cores, you should start several server processes and add the corresponding parameters into the client configuration file (see the sketch after this list).
 
-- The benchmark is network-intensive and thus requires a 10Gbit Ethernet card
-not to be network-bound. Multiple ethernet cards could be used as well,
-each with a different IP address (two servers in the client configuration file
-with the same socket, but different IP address).
-Multisocket machines could also mitigate the network bandwidth limitations by running the server
-and the client on different sockets of the same machine
-(e.g., pinned using taskset), communicating via localhost.
-
+- The benchmark is network-intensive and thus requires a 10Gbit Ethernet card to not be network-bound. Multiple ethernet cards could be used as well, each with a different IP address, resulting in multiple servers in the client configuration file with the same socket but different IP addresses.
 
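+To illustrate the scalability remark, here is a sketch of one way to run two four-thread server processes on a single eight-core machine. It assumes the server image forwards extra arguments to the memcached daemon (as the examples above do with `-t`, `-m`, and `-n`), so memcached's standard `-p` flag selects the TCP port; the ports and memory sizes are illustrative:
+
+```bash
+# Two memcached processes on one host, each on its own port
+$ docker run --name dc-server1 --net host -d cloudsuite/data-caching:server -t 4 -m 5120 -n 550 -p 11211
+$ docker run --name dc-server2 --net host -d cloudsuite/data-caching:server -t 4 -m 5120 -n 550 -p 11212
+```
+
+The client's `docker_servers.txt` would then list the same IP address twice, once per port, as in the multi-port example earlier.
+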
-  [memcachedWeb]: http://memcached.org/ "Memcached Website"
+[memcachedWeb]: http://memcached.org/ "Memcached Website"
 
-  [serverdocker]: https://github.com/parsa-epfl/cloudsuite/blob/master/benchmarks/data-caching/server/Dockerfile "Server Dockerfile"
+[serverdocker]: https://github.com/parsa-epfl/cloudsuite/blob/master/benchmarks/data-caching/server/Dockerfile "Server Dockerfile"
 
-  [clientdocker]: https://github.com/parsa-epfl/cloudsuite/blob/master/benchmarks/data-caching/client/Dockerfile "Client Dockerfile"
+[clientdocker]: https://github.com/parsa-epfl/cloudsuite/blob/master/benchmarks/data-caching/client/Dockerfile "Client Dockerfile"
 
-  [repo]: https://github.com/parsa-epfl/cloudsuite "GitHub Repo"
-  [dhrepo]: https://hub.docker.com/r/cloudsuite/data-caching/ "DockerHub Page"
-  [dhpulls]: https://img.shields.io/docker/pulls/cloudsuite/data-caching.svg "Go to DockerHub Page"
-  [dhstars]: https://img.shields.io/docker/stars/cloudsuite/data-caching.svg "Go to DockerHub Page"
+[repo]: https://github.com/parsa-epfl/cloudsuite "GitHub Repo"
+[dhrepo]: https://hub.docker.com/r/cloudsuite/data-caching/ "DockerHub Page"
+[dhpulls]: https://img.shields.io/docker/pulls/cloudsuite/data-caching.svg "Go to DockerHub Page"
+[dhstars]: https://img.shields.io/docker/stars/cloudsuite/data-caching.svg "Go to DockerHub Page"
diff --git a/docs/benchmarks/data-serving.md b/docs/benchmarks/data-serving.md
index 12ae6e486..48151820c 100644
--- a/docs/benchmarks/data-serving.md
+++ b/docs/benchmarks/data-serving.md
@@ -2,34 +2,31 @@
 [![Pulls on DockerHub][dhpulls]][dhrepo]
 [![Stars on DockerHub][dhstars]][dhrepo]
 
-The data serving benchmark relies on the Yahoo! Cloud Serving Benchmark (YCSB). YCSB is a framework to benchmark data store systems. This framework comes with appropriate interfaces to populate and stress many popular data serving systems. Here we provide the instructions and pointers to download and install YCSB and use it with the Cassandra data store.
-
-## Generating Datasets
-
-The YCSB client has a data generator. After starting Cassandra, YCSB can start loading the data. First, you need to create a keyspace named *usertable* and a column family for YCSB. This is a must for YCSB to load data and run.
-
+The Data Serving benchmark relies on the Yahoo! Cloud Serving Benchmark (YCSB). YCSB is a framework to benchmark data storage systems, and it comes with interfaces to populate and stress many popular database management systems. This benchmark uses YCSB to load Cassandra, one of the most popular NoSQL databases, mimicking a representative NoSQL database state in the cloud.
+
+### Dockerfiles
+
+Supported tags and their respective `Dockerfile` links:
+ - [`server`][serverdocker] contains Cassandra and the script to initialize its configuration.
+ - [`client`][clientdocker] contains the YCSB load generator.
 
 ### Server Container
 
-**Note**: The following commands will run the Cassandra within host's network. To make sure that slaves and master can communicate with each other, the master container's hostname, which should be host's hostname, must be able to be resolved to the same IP address by the master container and all slave containers.
-
-Start the server container that will run cassandra server and installs a default keyspace usertable:
+Start the server container that will run a Cassandra server:
 
 ```bash
 $ docker run --name cassandra-server --net host cloudsuite/data-serving:server
 ```
 
-The following options can be used to modify the setting of the server:
-- `--listen-ip=`: Cassandra's listening IP address. By default, the script would automatically detect the active IP address and use it for Cassandra. When the default setting does not work, or you have multiple IP addresses, you can use this option to specify one.
-- `--reader-count=`: The number of reader threads Cassandra uses. According to Cassandra's suggestion, each disk containing the database could have 16 threads to hide its latency. The default value is 16, assuming all the data is stored in on a single disk.
-- `--writer-count=`: The number of writer threads Cassandra uses. Cassandra recommends 8 thread per CPU core. The default value is 32.
-- `--heap-size=`: The size of JVM heap. Its unit is GB and by default, JVM uses `max(min(1/2 ram, 1GB), min(1/4 ram, 8GB))`. It is good to overload the value when the server has enough DRAM for better performance, or restrict the value for explicit resource restriction.
-- `--affinity=`: The CPU Cassandra works on. This setting is useful to explicitly set CPU affinity. Usually, it is combined with container's resource management option (e.g., `--cpuset-cpus`).
+The following options can modify the settings of the server:
+- `--listen-ip=`: Cassandra's listening IP address. By default, the script will automatically detect and use the active IP address for Cassandra. However, when the default setting does not work or you have multiple IP addresses, you can use this option to specify one. Please make sure this IP address is accessible by the client.
+- `--reader-count=`: The number of reader threads Cassandra uses. According to Cassandra's suggestions, each disk containing the database should have 16 threads to hide its latency. The default value is 16, assuming all the data is stored on a single disk.
+- `--writer-count=`: The number of writer threads Cassandra uses. Cassandra recommends 8 threads per CPU core. The default value is 32.
+- `--heap-size=`: JVM heap size. Its unit is GB, and by default, JVM uses `max(min(1/2 ram, 1GB), min(1/4 ram, 8GB))`. It is good to increase the value for better performance when the server has enough DRAM, or to lower the value for explicit resource restriction.
+- `--affinity=`: The CPUs Cassandra works on. This setting explicitly makes Cassandra aware of its CPU affinity. It should be used together with the container's resource management option (e.g., `--cpuset-cpus`).
 
 ### Multiple Server Containers
 
-Please note the server containers cannot be hosted on the same node when the host network configuration is used because they will all try to use the same port.
+Please note server containers cannot be hosted on the same node when the host network configuration is used, because they all use the same port.
 
 For a cluster setup with multiple servers, we need to instantiate a seed server:
 
 ```bash
 $ docker run --name cassandra-server-seed --net host cloudsuite/data-serving:server
 ```
 
-Then we prepare the server as previously.
-
 The other server containers are instantiated as follows on **different VMs**:
 
 ```bash
-$ docker run --name cassandra-server(id) --net host cloudsuite/data-serving:server --seed-server-ip=
-```
+$ docker run --name cassandra-subserver --net host cloudsuite/data-serving:server --seed-server-ip=<SEED_SERVER_IP>
+```
 
-You can find more details at the websites: http://wiki.apache.org/cassandra/GettingStarted and https://hub.docker.com/_/cassandra/.
-
-Make sure all non-seed servers are established (adding them concurrently may lead to a [problem](https://docs.datastax.com/en/cassandra/2.1/cassandra/operations/ops_add_node_to_cluster_t.html)).
+You may find a more detailed tutorial on checking the cluster status and customizing the YAML configuration file [here](https://www.digitalocean.com/community/tutorials/how-to-install-cassandra-and-run-a-multi-node-cluster-on-ubuntu-22-04).
 
 ### Client Container
 
-After successfully creating the aforementioned schema, you are ready to benchmark with YCSB.
-Start the client container specifying server name(s), or IP address(es), separated with commas, as the last command argument:
+Start the client container with bash:
 
 ```bash
 $ docker run -it --name cassandra-client --net host cloudsuite/data-serving:client bash
 ```
 
-Before running the measurement, you have to fill the server with the dataset. Use the script `warmup.sh` for a quick setting:
+Before running the measurement, you have to fill the server with the dataset. Use the script `warmup.sh`:
 
 ```bash
 $ ./warmup.sh
 ```
 
-During warm up period, the script create a table to the seed server, and populate the table with given number of record. Based on the definition(see setup_tables.txt) of the record, the size of each record is 1KB. As a result, a typical 10GB dataset requires 10M records. You can also increase the number of YCSB threads to improve the writing speed, in case the load generator becomes the bottleneck.
+During the warm-up period, the script creates a table on the seed server and populates it with a given number of records. Based on the definition (see `setup_tables.txt`) of the record, the size of each record is 1KB. As a result, a typical 10GB dataset requires 10M records. You can also increase the number of YCSB threads to improve the writing speed if the load generator becomes the bottleneck.
 
-After the warm up is finished, you can use `load.sh` to apply load to the server:
+After the warm-up is finished, you can use `load.sh` to apply load to the server, with 50% read and 50% update operations:
 
 ```bash
 $ ./load.sh
 ```
 
-You can give your expected load and YCSB would try to meet the requirement. In case the server cannot sustain the given load, the reported throughput would be smaller. You can also control the operation count to control the running time. Similar to the warm up stage, you can also increase the YCSB thread count if the load generator is the bottleneck.
+You can give your expected load, and YCSB will try to meet the requirement. The reported throughput will be smaller if the server cannot sustain the given load. You can also control the total run time by changing `operation_count`. Like the warm-up stage, you can increase the YCSB thread count if the load generator is the bottleneck.
 
-More detailed instructions on generating the dataset and load can be found in Step 5 at [this](http://github.com/brianfrankcooper/YCSB/wiki/Running-a-Workload) link. Although Step 5 in the link describes the data loading procedure, other steps (e.g., 1, 2, 3, 4) are very useful to understand the YCSB settings.
-In this case, our scripts (`warmup.sh` and `load.sh`) are good template for further customization.
+More detailed instructions on generating the dataset and load can be found in Step 5 at [this](http://github.com/brianfrankcooper/YCSB/wiki/Running-a-Workload) link. Although Step 5 in the link describes the data loading procedure, the other steps (e.g., 1, 2, 3, 4) are useful for understanding the YCSB settings. In this case, our scripts (`warmup.sh` and `load.sh`) are good templates for further customization.
 
 A rule of thumb on the dataset size
 -----------------------------------
 
-To emulate a realistic setup, you can generate more data than your main memory size if you have a low-latency, high-bandwidth I/O subsystem. For example, for a machine with 24GB memory, you can generate 30 million records corresponding to a 30GB dataset size.
-
-_Note_: The dataset resides in Cassandra’s data folder(s).The actual data takes up more space than the total size of the records because data files have metadata structures (e.g., index). Make sure you have enough disk space.
+If you are only profiling CPU microarchitectures, you should ensure that the hot part of the data (3%~5% of the dataset) cannot be buffered on-chip, to mimic a realistic situation. Usually, a 10GB dataset is enough for a typical CPU with less than 50MB of LLC.
 
 Tuning the server performance
 -----------------------------
 
-1. In general, the server settings are under the $CASSANDRA_PATH/conf folder. The main file is cassandra.yaml. The file has comments about all parameters. This parameters can also be found here: http://wiki.apache.org/cassandra/StorageConfiguration
-2. You can modify the *target* and *threadcount* variables to tune the benchmark and utilize the server. The throughput depends on the number of hard drives on the server. If there are enough disks, the cores can be utilized after running the benchmark for 10 minutes. Make sure that half of the main memory is free for the operating system file buffers and caching.
-3. Additionally, the following links are useful pointers for performance tuning:
-
-   a. http://spyced.blogspot.com/2010/01/linux-performance-basics.html
+1. There is no fixed tail latency requirement for this workload. As a reference, the 99th percentile latency should usually be around 5ms to 10ms so as not to delay its upstream service.
+2. The server settings are under the $CASSANDRA_PATH/conf folder. The main file is cassandra.yaml. The file has comments about all parameters. These parameters can also be found here: http://wiki.apache.org/cassandra/StorageConfiguration
+3. Make sure that half of the main memory is free for the operating system file buffers and caching.
+4. As a JVM-based workload, the server needs load applied to warm up the JIT cache. Keep monitoring the throughput and tail latency, and take measurements once they become relatively stable. As a reference, it takes around 2 minutes for a modern x86 machine (Skylake) to attain stable throughput (5000 RPS, 50% read and 50% update).
+5. The following links are useful pointers for performance tuning:
 
-   b. http://wiki.apache.org/cassandra/MemtableThresholds
+   a. http://spyced.blogspot.com/2010/01/linux-performance-basics.html
+   b. http://wiki.apache.org/cassandra/MemtableThresholds
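+
+As a reference for customizing `warmup.sh` and `load.sh`, the following sketch shows the kind of direct YCSB invocation such scripts wrap. It assumes YCSB's standard Cassandra CQL binding and stock property names (`recordcount`, `operationcount`); the actual scripts in the client image may use different names and defaults:
+
+```bash
+# Hypothetical direct YCSB calls; workloada is YCSB's stock 50% read / 50% update mix.
+# 10M records * 1KB/record ~= 10GB dataset, matching the warm-up description above.
+$ ./bin/ycsb load cassandra-cql -P workloads/workloada -p hosts=<SEED_SERVER_IP> \
+    -p recordcount=10000000 -threads 16
+# Apply a 5000 requests-per-second target load for 1M operations.
+$ ./bin/ycsb run cassandra-cql -P workloads/workloada -p hosts=<SEED_SERVER_IP> \
+    -p recordcount=10000000 -p operationcount=1000000 -target 5000 -threads 16
+```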
 
 [dhrepo]: https://hub.docker.com/r/cloudsuite/data-serving/ "DockerHub Page"
 [dhpulls]: https://img.shields.io/docker/pulls/cloudsuite/data-serving.svg "Go to DockerHub Page"
 [dhstars]: https://img.shields.io/docker/stars/cloudsuite/data-serving.svg "Go to DockerHub Page"
+
+[serverdocker]: https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/data-serving/server/Dockerfile "Server Dockerfile"
+
+[clientdocker]: https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/data-serving/client/Dockerfile "Client Dockerfile"
\ No newline at end of file
diff --git a/docs/benchmarks/graph-analytics.md b/docs/benchmarks/graph-analytics.md
index c46095a0c..be0ae5b52 100644
--- a/docs/benchmarks/graph-analytics.md
+++ b/docs/benchmarks/graph-analytics.md
@@ -3,82 +3,82 @@
 [![Pulls on DockerHub][dhpulls]][dhrepo]
 [![Stars on DockerHub][dhstars]][dhrepo]
 
-This repository contains the docker image for Cloudsuite's Graph Analytics benchmark.
+The Graph Analytics benchmark relies on the Spark framework to perform graph analytics on large-scale datasets. Apache provides a graph processing library, GraphX, designed to run on top of Spark. This benchmark performs various algorithms, including PageRank, on a Twitter dataset.
 
-The Graph Analytics benchmark relies the Spark framework to perform graph analytics on large-scale datasets. Apache provides a graph processing library, GraphX, designed to run on top of Spark. The benchmark performs PageRank on a Twitter dataset.
+
+### Dockerfiles
+
+Supported tags and their respective `Dockerfile` links:
+- [`latest`][latestcontainer] contains the algorithm and application entrypoint.
 
 ### Datasets
 
-The benchmark uses a graph dataset generated from Twitter. To get the dataset image:
+The benchmark uses a graph dataset generated from Twitter. To create the dataset container:
 
-```sh
- $ docker pull cloudsuite/twitter-dataset-graph
- $ docker create --name twitter-data cloudsuite/twitter-dataset-graph
+```bash
+$ docker create --name twitter-data cloudsuite/twitter-dataset-graph
 ```
 
-More information about the dataset is available at
-[cloudsuite/twitter-dataset-graph][ml-dhrepo].
-
 ### Running/Tweaking the Benchmark
 
-The benchmark can run three graph algorithms using GraphX through the spark-submit script distributed with Spark. The algorithms are page rank, connected components, and triangle count.
+The benchmark can run three graph algorithms using GraphX: PageRank, connected components, and triangle count.
 
 To run the benchmark, run the following command:
 
-```sh
- $ docker run --rm --volumes-from twitter-data -e WORKLOAD_NAME=pr cloudsuite/graph-analytics \
- --driver-memory 4g --executor-memory 4g
+```bash
+$ docker run --rm --volumes-from twitter-data -e WORKLOAD_NAME=pr cloudsuite/graph-analytics \
+    --driver-memory 8g --executor-memory 8g
 ```
 
-Note that any argument passed to the container will be directed to spark-submit. In the given command, to ensure that Spark has enough memory allocated to be able to execute the benchmark in-memory, --driver-memory and --executor-memory arguments are passed to spark-submit. Adjust the spark-submit arguments based on the chosen algorithm and your system and container's configurations.
+Note that any arguments passed to the container will be directed to `spark-submit`, the interface to submit jobs to Spark. In the given command, to ensure that Spark has enough memory allocated to execute the benchmark in memory, the `--driver-memory` and `--executor-memory` arguments are passed to `spark-submit`.
 
-The environment variable `WORKLOAD_NAME` sets the graph algorithm that the container executes. Use `pr`, `cc`, and `tc` for page rank, connected components, and triangle count, respectively. Page rank is selected by default if no process
+The environment variable `WORKLOAD_NAME` sets the graph algorithm that the container executes. Use `pr`, `cc`, and `tc` for PageRank, connected components, and triangle count, respectively. PageRank is selected by default if not set.
 
-All these analytics require huge memory to finish when more cores are involved. As ar reference, running `tc` with single CPU core requires both 8GB driver-memory and executor-memory. If you allocate more cores, more memory is necessary. You will see the `OutOfMemoryError` exception if you do not allocate enough memory.
+All of these analytic workloads require huge memory to finish. As a reference, running `tc` on a single CPU core requires 8GB each of driver and executor memory. If you allocate more cores, more memory is necessary. You will see the `OutOfMemoryError` exception if you do not allocate enough memory. We recommend giving more than 16GB of memory for each core to minimize GC activities, which can influence your measurements.
 
 ### Multi-node deployment
 
-This section explains how to run the benchmark using multiple Spark
-workers (each running in a Docker container) that can be spread across
-multiple nodes in a cluster. For more information on running Spark
-with Docker look at [cloudsuite/spark:3.3.2][spark-dhrepo].
+This section explains how to run the benchmark using multiple Spark workers (each running in a Docker container) that can be spread across multiple nodes in a cluster.
 
-First, create a dataset image on every physical node where Spark
-workers will be running.
+First, create a dataset image on every physical node where Spark workers will be running.
 
-```sh
- $ docker create --name twitter-data cloudsuite/twitter-dataset-graph
+```bash
+$ docker create --name twitter-data cloudsuite/twitter-dataset-graph
 ```
 
 Start Spark master and Spark workers. You can start the master node with the following command:
 
-```sh
- $ docker run -dP --net host --name spark-master \
- cloudsuite/spark:3.3.2 master
+```bash
+$ docker run -dP --net host --name spark-master \
+    cloudsuite/spark:3.3.2 master
 ```
 
-By default, the container uses the hostname as the listening IP for the connections to the worker nodes. Make sure all worker machines can access the master machine using the master host name if the listening IP is kept by default.
-You can also override the listening address by overriding the environment variable `SPARK_MASTER_IP` by starting container with `-e SPARK_MASTER_IP=X.X.X.X`.
+By default, the container uses the hostname as the listening IP for the connections to the worker nodes. Therefore, ensure all worker machines can access the master machine using the master hostname if the listening IP is kept by default.
+You can also override the listening address by setting the environment variable `SPARK_MASTER_IP` using the container option `-e SPARK_MASTER_IP=X.X.X.X`.
 
-The workers get access to the datasets with `--volumes-from twitter-data`.
+The workers get access to the dataset with `--volumes-from twitter-data`.
-```sh
- $ docker run -dP --net host --volumes-from twitter-data --name spark-worker-01 \
- cloudsuite/spark:3.3.2 worker spark://SPARK-MASTER:7077
- $ docker run -dP --net host --volumes-from twitter-data --name spark-worker-02 \
- cloudsuite/spark:3.3.2 worker spark://SPARK-MASTER:7077
- $ ...
+```bash
+# Set up worker 1
+$ docker run -dP --net host --volumes-from twitter-data --name spark-worker-01 \
+    cloudsuite/spark:3.3.2 worker spark://SPARK-MASTER:7077
+
+# Set up worker 2
+$ docker run -dP --net host --volumes-from twitter-data --name spark-worker-02 \
+    cloudsuite/spark:3.3.2 worker spark://SPARK-MASTER:7077
+
+# ...
 ```
 
 `SPARK_MASTER` is Spark master's listening address.
 
-Finally, run the benchmark as the client to the Spark master:
+Finally, run the benchmark as a client to the Spark master:
 
-```
- $ docker run --rm --net host --volumes-from twitter-data -e WORKLOAD_NAME=pr \
- cloudsuite/graph-analytics \
- --driver-memory 4g --executor-memory 4g \
- --master spark://SPARK-MASTER:7077
+```bash
+$ docker run --rm --net host --volumes-from twitter-data -e WORKLOAD_NAME=pr \
+    cloudsuite/graph-analytics \
+    --driver-memory 8g --executor-memory 8g \
+    --master spark://SPARK-MASTER:7077
 ```
 
@@ -87,3 +87,5 @@
 [dhstars]: https://img.shields.io/docker/stars/cloudsuite/graph-analytics.svg "Go to DockerHub Page"
 [ml-dhrepo]: https://hub.docker.com/r/cloudsuite/twitter-dataset-graph/
 [spark-dhrepo]: https://hub.docker.com/r/cloudsuite/spark/
+
+[latestcontainer]: https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/graph-analytics/latest/Dockerfile "link to container, github"
diff --git a/docs/benchmarks/in-memory-analytics.md b/docs/benchmarks/in-memory-analytics.md
index abf4cef8f..ff0eb02a6 100644
--- a/docs/benchmarks/in-memory-analytics.md
+++ b/docs/benchmarks/in-memory-analytics.md
@@ -3,28 +3,21 @@
 [![Pulls on DockerHub][dhpulls]][dhrepo]
 [![Stars on DockerHub][dhstars]][dhrepo]
 
-This benchmark uses Apache Spark and runs a collaborative filtering algorithm
-in-memory on a dataset of user-movie ratings. The metric of interest is the
-time in seconds of computing movie recommendations.
-
-The explosion of accessible human-generated information necessitates automated
-analytical processing to cluster, classify, and filter this information.
-Recommender systems are a subclass of information filtering system that seek to
-predict the 'rating' or 'preference' that a user would give to an item.
-Recommender systems have become extremely common in recent years, and are
-applied in a variety of applications. The most popular ones are movies, music,
-news, books, research articles, search queries, social tags, and products in
-general. Because these applications suffer from I/O operations, nowadays, most
-of them are running in memory. This benchmark runs the alternating least
-squares (ALS) algorithm which is provided by Spark MLlib.
+The explosion of human-generated information necessitates automated analytical processing to cluster, classify, and filter this information. Recommender systems are a subclass of information filtering systems that seek to predict the 'rating' or 'preference' that a user would give to an item. They have become extremely popular in recent years and are applied in various applications like movies, music, news, books, research articles, search queries, and social tags. Because these applications are I/O-bound, most of them now run in memory to provide realtime results.
 
+This benchmark uses Apache Spark and runs a collaborative filtering algorithm (alternating least squares, ALS) provided by Spark MLlib in memory on a dataset of user-movie ratings. The metric of interest is the time in seconds for computing movie recommendations.
+
+### Dockerfiles
+
+Supported tags and their respective `Dockerfile` links:
+- [`latest`][latestcontainer] contains the algorithm and application entrypoint.
 
 ### Datasets
 
-The benchmark uses user-movie ratings datasets provided by Movielens. To get
-the dataset image:
+The benchmark uses user-movie ratings datasets provided by Movielens. Run the following command to create the dataset container:
 
-```sh
- $ docker pull cloudsuite/movielens-dataset
+```bash
+$ docker create --name movielens-data cloudsuite/movielens-dataset
 ```
 
 More information about the dataset is available at
 [cloudsuite/movielens-dataset][ml-dhrepo].
 
@@ -32,80 +25,70 @@
 
 ### Running the Benchmark
 
-The benchmark runs the ALS algorithm on Spark through the spark-submit script
-distributed with Spark. It takes two arguments: the dataset to use for
-training, and the personal ratings file to give recommendations for. Any
-remaining arguments are passed to spark-submit.
+The benchmark runs the ALS algorithm on Spark through the spark-submit script distributed with Spark. It takes two arguments: the dataset for training and the personal ratings file for recommendations. We provide two training datasets (one small and one large) and a sample personal ratings file.
 
-The cloudsuite/movielens-dataset image has two datasets (one small and one
-large), and a sample personal ratings file.
-
-To run a benchmark with the small dataset and the provided personal ratings
-file:
+To run the benchmark with the small dataset and the provided personal ratings file:
 
-```sh
- $ docker create --name movielens-data cloudsuite/movielens-dataset
- $ docker run --volumes-from movielens-data cloudsuite/in-memory-analytics \
- /data/ml-latest-small /data/myratings.csv
+```bash
+$ docker run --volumes-from movielens-data cloudsuite/in-memory-analytics \
+    /data/ml-latest-small /data/myratings.csv
 ```
 
+Any additional arguments are passed to `spark-submit`, which is the interface to submit jobs to Spark.
+
 ### Tweaking the Benchmark
 
-Any arguments after the two mandatory ones are passed to spark-submit and can
-be used to tweak execution. For example, to ensure that Spark has enough memory
-allocated to be able to execute the benchmark in-memory, supply it with
---driver-memory and --executor-memory arguments:
+Any arguments after the two mandatory ones are passed to `spark-submit` and are used to tweak execution parameters. For example, to ensure that Spark has enough memory allocated to be able to execute the benchmark in memory, supply it with `--driver-memory` and `--executor-memory` arguments:
 
-```sh
- $ docker run --volumes-from movielens-data cloudsuite/in-memory-analytics \
- /data/ml-latest /data/myratings.csv \
- --driver-memory 4g --executor-memory 4g
+```bash
+$ docker run --volumes-from movielens-data cloudsuite/in-memory-analytics \
+    /data/ml-latest /data/myratings.csv \
+    --driver-memory 4g --executor-memory 4g
 ```
 
 ### Multi-node deployment
 
-This section explains how to run the benchmark using multiple Spark workers
-(each running in a Docker container) that can be spread across multiple nodes
-in a cluster.
-For more information on running Spark with Docker look at
-[cloudsuite/spark:3.3.2][spark-dhrepo].
+This section explains how to run the benchmark using multiple Spark workers (each running in a Docker container) that can be spread across multiple nodes in a cluster.
 
-First, create a dataset image on every physical node where Spark workers will
-be running.
+First, create a dataset image on every physical node where Spark workers will be running.
 
-```sh
- $ docker create --name movielens-data cloudsuite/movielens-dataset
+```bash
+$ docker create --name movielens-data cloudsuite/movielens-dataset
 ```
 
 Start Spark master and Spark workers. You can start the master node with the following command:
 
-```sh
- $ docker run -dP --net host --name spark-master \
- cloudsuite/spark:3.3.2 master
+```bash
+$ docker run -dP --net host --name spark-master \
+    cloudsuite/spark:3.3.2 master
 ```
 
-By default, the container uses the hostname as the listening IP for the connections to the worker nodes. Make sure all worker machines can access the master machine using the master host name if the listening IP is kept by default.
-You can also override the listening address by overriding the environment variable `SPARK_MASTER_IP` by starting container with `-e SPARK_MASTER_IP=X.X.X.X`.
+By default, the container uses the hostname as the listening IP for the connections to the worker nodes. Ensure all worker machines can access the master machine using the master hostname if the listening IP is kept by default. You can also override the listening address by setting the environment variable `SPARK_MASTER_IP` with the container option `-e SPARK_MASTER_IP=X.X.X.X`.
 
 The workers get access to the datasets with `--volumes-from movielens-data`.
 
-```
- $ docker run -dP --net host --volumes-from movielens-data --name spark-worker-01 \
- cloudsuite/spark:3.3.2 worker spark://SPARK-MASTER:7077
- $ docker run -dP --net host --volumes-from movielens-data --name spark-worker-02 \
- cloudsuite/spark:3.3.2 worker spark://SPARK-MASTER:7077
- $ ...
+```bash
+# Set up worker 1
+$ docker run -dP --net host --volumes-from movielens-data --name spark-worker-01 \
+    cloudsuite/spark:3.3.2 worker spark://SPARK-MASTER:7077
+
+# Set up worker 2
+$ docker run -dP --net host --volumes-from movielens-data --name spark-worker-02 \
+    cloudsuite/spark:3.3.2 worker spark://SPARK-MASTER:7077
+
+# ...
 ```
 
 `SPARK_MASTER` is Spark master's listening address.
 
-Finally, run the benchmark as the client to the Spark master:
+Finally, run the benchmark as a client to the Spark master:
 
-```
- $ docker run --rm --net host --volumes-from movielens-data \
- cloudsuite/in-memory-analytics \
- /data/ml-latest-small /data/myratings.csv \
- --driver-memory 4g --executor-memory 4g \
- --master spark://SPARK-MASTER:7077
+```bash
+$ docker run --rm --net host --volumes-from movielens-data \
+    cloudsuite/in-memory-analytics \
+    /data/ml-latest-small /data/myratings.csv \
+    --driver-memory 4g --executor-memory 4g \
+    --master spark://SPARK-MASTER:7077
 ```
 
 [dhrepo]: https://hub.docker.com/r/cloudsuite/in-memory-analytics/ "DockerHub Page"
@@ -113,3 +96,5 @@
 [dhstars]: https://img.shields.io/docker/stars/cloudsuite/in-memory-analytics.svg "Go to DockerHub Page"
 [ml-dhrepo]: https://hub.docker.com/r/cloudsuite/movielens-dataset/
 [spark-dhrepo]: https://hub.docker.com/r/cloudsuite/spark/
+
+[latestcontainer]: https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/in-memory-analytics/latest/Dockerfile "link to container, github"
diff --git a/docs/benchmarks/media-streaming.md b/docs/benchmarks/media-streaming.md
index 8959824ed..a02e28aa6 100644
--- a/docs/benchmarks/media-streaming.md
+++ b/docs/benchmarks/media-streaming.md
@@ -3,37 +3,34 @@
 [![Pulls on DockerHub][dhpulls]][dhrepo]
 [![Stars on DockerHub][dhstars]][dhrepo]
 
-This benchmark uses the [Nginx][nginx_repo] web server as a streaming server for hosted videos of various lengths and qualities. Based on [videoperf][httperf_repo]'s session generator, the client sends requests for different videos to stress the server.
+This benchmark uses the [Nginx][nginx_repo] web server as a streaming server for hosting videos of various lengths and qualities. Based on [videoperf][httperf_repo]'s session generator, the client requests different videos to stress the server.
 
 ## Using the Benchmark ##
 
-The benchmark has two tiers: the server and the client. The server runs Nginx, and the client sends requests to stream videos. Each tier has its image, which is identified by its tag.
+The benchmark has two tiers: the server and the client. The server runs Nginx, and the client requests videos. Each tier has its own image, identified by its tag.
 
 ### Dockerfiles ###
 
 Supported tags and their respective `Dockerfile` links:
 
-  - [`server`][serverdocker]: This image represents the Nginx streaming server running as a daemon.
-  - [`client`][clientdocker]: This image represents the `videoperf` client.
-  - [`dataset`][datasetdocker]: This image provides the video dataset for the streaming server.
+  - [`server`][serverdocker] contains the Nginx streaming server running as a daemon.
+  - [`client`][clientdocker] contains the `videoperf` client.
+  - [`dataset`][datasetdocker] provides the video dataset for the streaming server.
 
-### Running Dataset and Server on Host1
+### Running Dataset and Server
 
-The dataset image has two purposes. First, it generates video files with different resolutions (from 240p to 720p) for the server docker container.
-Then, based on the generated videos, it suggests the request lists for the client docker container.
+The dataset image has two purposes. First, it generates video files with different resolutions (from 240p to 720p) for the server docker container. Then, based on the generated videos, it suggests request lists for the client docker container.
-First, use the following command to pull the dataset image:
+Use the following command to run the dataset container:

- $ docker pull cloudsuite/media-streaming:dataset
-
-Then, use the following command to run the dataset container:
+```bash
+$ docker run --name streaming_dataset cloudsuite/media-streaming:dataset ${DATASET_SIZE} ${SESSION_COUNT}
+```

- $ docker run --name streaming_dataset cloudsuite/media-streaming:dataset ${DATASET_SIZE} ${SESSION_COUNT}
-
-`DATASET_SIZE` in GBs, scales the size of the dataset to the given number. By default, the dataset container generates a collection of 10 videos for each of 240p, 360p, 480p, and 720p resolutions, having around 3.5 GB size altogether.
+The `DATASET_SIZE` parameter scales the dataset to the given size in GB. By default, the dataset container generates ten videos for each of the 240p, 360p, 480p, and 720p resolutions, resulting in around 3.5 GB in total.

-`SESSION_COUNT` denotes the number of sessions to stream the video files. For every resolution, the dataset container generates a list of sessions (named `session lists`) to guide the client on how to stress the server. By default, the value is five.
+The `SESSION_COUNT` parameter denotes the number of sessions for requesting video files. For each resolution, the dataset container generates a list of sessions (named `session lists`) to guide the client on how to stress the server. By default, the value is five.

-In `videoperf`'s context, a `session` is a sequence of HTTP/HTTPS requests to fetch a specific video chunk. As a reference, you can find a sample session below:
+In `videoperf`'s context, a `session` is a sequence of HTTP/HTTPS queries that fetch a specific video in chunks. As a reference, you can find a sample session below:

```
/full-240p-00004.mp4 timeout=10 headers='Range: bytes=0-524287'
@@ -44,59 +41,57 @@ In `videoperf`'s context, a `session` is a sequence of HTTP/HTTPS requests to fe
/full-240p-00004.mp4 pace_time=10 timeout=10 headers='Range: bytes=2621440-3145727'
/full-240p-00004.mp4 pace_time=10 timeout=10 headers='Range: bytes=3145728-3221180'
```
-Each line here defines an HTTP/HTTPS request with the following fields:
-- The name of the requested video. In our example, requests ask for `/full-240p-00004.mp4`.
-- `timeout` determines the maximum time the client waits before receiving any response after sending the request to the server. Once expired, the client closes the connections and increases the timeout counter.
-- `pace_time` determines the latency between sending two consecutive requests. By default, the first three requests of each session don't have this field, which means they are sent together.
-- `header` declares a range of video bytes for each request.
-
-It is possible that sessions in the `session lists` don't touch the whole dataset. In this case, consider increasing the `SESSION_COUNT`. For each resolution, check the beginning of the `session lists` to see the related statistics.
-
-### Starting the Server on Host1 ####
-First, pull the server image with the following command:
-
- $ docker pull cloudsuite/media-streaming:server
+Each line here defines an HTTP/HTTPS query with the following fields:
+- The name of the requested video. In our example, all queries ask for `/full-240p-00004.mp4`.
+- `timeout` determines the maximum time the client waits to receive any response after sending a query to the server. Once expired, the client closes the connection and increases the timeout counter.
+- `pace_time` determines the latency between sending consecutive queries. By default, the first three queries of each session don't have this field, which means they are sent together.
+- `headers` declares the range of video bytes fetched by each query.

-Then, start the server on the same machine as the dataset container:

+It is possible that sessions in the `session lists` don't touch the whole dataset. In this case, consider increasing the `SESSION_COUNT`. Check the beginning of the `session lists` for each resolution to see the related statistics.

- $ docker run -d --name streaming_server --volumes-from streaming_dataset --net host cloudsuite/media-streaming:server ${NGINX_WORKERS}

+### Starting the Server ###
+Start the server on the same machine as the dataset container:

-The `NGINX_WORKERS` parameter sets the number of Nginx workers. If not given, the default value is 2000. Adjust this number based on the server's computational resources and the intended load.
-
-### Starting the Client on Host2 ###
+```bash
+$ docker run -d --name streaming_server --volumes-from streaming_dataset --net host cloudsuite/media-streaming:server ${NGINX_WORKERS}
+```

-You need to copy the `session lists` from the dataset container and then transfer the files to Host2, where you want to launch the `videoperf` client.
+The `NGINX_WORKERS` parameter sets the number of Nginx workers. If not given, the default value is 2000. Adjust this number based on the server's computational resources and the intended load.

-To copy `session lists` from the dataset container to Host 1, use the following command:
+### Starting the Client ###

- $ docker cp streaming_dataset:/videos/logs
+You need to copy the `session lists` from the dataset container and then transfer them to the client machine where you want to launch the `videoperf` client.

-Then, you can use any command (e.g., `scp`, `rsync`) to transfer files to Host2.
+To copy the `session lists` from the dataset container to the host, use the following command:

-To start the client container, you must pull the client image and run it. To pull the client image, use the following command:
+```bash
+$ docker cp streaming_dataset:/videos/logs <path_to_logs>
+```

- $ docker pull cloudsuite/media-streaming:client
+Then, you can use any command (e.g., `scp`, `rsync`) to transfer the files to the client machine. To run the client container, use the following command:

- $ docker run -t --name=streaming_client -v :/videos/logs -v :/output --net host cloudsuite/media-streaming:client ${SERVER_IP} ${VIDEOPERF_PROCESSES} ${CLIENTS} ${RATE} ${ENCRYPTION_MODE}
+```bash
+$ docker run -t --name=streaming_client -v <path_to_logs>:/videos/logs -v <path_to_output>:/output --net host cloudsuite/media-streaming:client ${SERVER_IP} ${VIDEOPERF_PROCESSES} ${VIDEO_COUNT} ${RATE} ${ENCRYPTION_MODE}
+```

Parameters are:
- `<path_to_logs>`: The path where the `session lists` are put. You should be able to find files like `cl-*.log` there.
- `<path_to_output>`: The path for the benchmark statistics files.
-- `SERVER_IP`: The IP address of the server, which should be the Host1 in this document.
+- `SERVER_IP`: The IP address of the server.
- `VIDEOPERF_PROCESSES`: The number of videoperf processes, with a default value of 4.
-- `CLIENTS`: The total number of clients. Each client will pick one session from the `session list` and send the corresponding requests.
-- `RATE`: The rate (client per second) for client generation.
+- `VIDEO_COUNT`: The total number of videos to request. Each video is represented by one session from the `session list`, and the client requests the video by sending HTTP queries that fetch its chunks sequentially.
+- `RATE`: The rate (videos per second) at which new video requests are generated.
- `ENCRYPTION_MODE`: Whether the transfer is encrypted or not. Possible values are "PT", which stands for plain text; and "TLS", which enables TLS v1.3.

-#### Note for Client Generation
+#### Note for Video Request Generation

-Clients are distributed equally among different videoperf processes to balance the load. For example, if you have 1000 clients and 5 videoperf processes, each process will handle 200 clients. Then, each videoperf process further distributes its clients among different resolutions by 10%, 30%, 40%, and 20% probabilities for 240p, 360p, 480p, and 720p, respectively. You might want to change these probabilities based on your need by modifying [this](https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/media-streaming/client/files/run/peak_hunter/launch_remote.sh) script.
+Video requests are distributed equally among different videoperf processes to balance the load. For example, if you have 1000 videos to request and 5 videoperf processes, each process will handle 200 videos. Then, each videoperf process further distributes its videos among different resolutions with 10%, 30%, 40%, and 20% probabilities for 240p, 360p, 480p, and 720p, respectively. You might want to change these probabilities based on your need by modifying [this](https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/media-streaming/client/files/run/peak_hunter/launch_remote.sh) script.

-Suppose the number of clients for a specific resolution becomes larger than the number of sessions available in its `session list`. Then, the client container starts from the beginning of the `session list` to create further clients.
+If the number of required videos for a specific resolution is larger than the number of sessions in its `session list`, the client container wraps around and reuses sessions from the beginning of the list.

-Some load generators implement a thread pool and assign each client to a thread. On the contrary, videoperf is a single-thread process. Its programming model is based on scheduling timers, which call the corresponding function once expired. For example, there is a [periodic timer](https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/media-streaming/client/files/videoperf/gen/rate.c#L132) that is set based on the `RATE` parameter, and is used to generate new clients.
+Some load generators implement a thread pool and assign each video request to a thread. In contrast, videoperf is a single-thread process. Its programming model is based on scheduling timers, each of which calls its corresponding function once it expires. For example, there is a [periodic timer](https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/media-streaming/client/files/videoperf/gen/rate.c#L132) that is set based on the `RATE` parameter and is used to generate new video requests. Videoperf is an open-loop load generator: it sends subsequent requests independently of the server's responses to previous requests.
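+
+Because each videoperf process is single-threaded, one saturated core can cap the generated load. A quick, generic way to check the client container's CPU usage (a standard Docker command, shown here only as a sanity check):
+
+```bash
+# One-shot CPU/memory snapshot of the client container.
+$ docker stats streaming_client --no-stream
+```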
@@ -105,23 +100,24 @@
-After running the benchmark, the client container periodically reports three metrics:
+After running the benchmark, the client container periodically reports four metrics:
- Throughput in Mbps
- The total number of errors encountered during the benchmark's execution
-- The number of concurrent established clients to the server
+- The number of concurrent established requests to the server
+- The reply rate, which is the number of HTTP requests finished per second

A sample report looks like this:

```
-Throughput (Mbps) = 1325.97 , total_errors = 0 , concurrent-clients = 317
+Throughput (Mbps) = 465.59 , total-errors = 0 , concurrent-clients = 161 , reply-rate = 17.6
```

Note that each videoperf process reports its own statistics. Therefore, the overall state of the benchmark is the sum of the individual reports.

-To tune the benchmark, give a starting rate as the seed (we suggest 1). The benchmark would take a few minutes to reach a steady state. Consequently, consider giving an appropriate number for `CLIENTS`. For example, if the benchmark did not reach the steady state in 5 minutes and the `RATE` was ten clients per second, the number of clients would be larger than 5x60x10=3000. Therefore, we suggest giving a large value to `CLIENTS` to sustain a long steady state.
+To tune the benchmark, give a starting rate as the seed (we suggest 1). The benchmark will take a few minutes to reach a steady state, so consider giving an appropriate number for `VIDEO_COUNT`. For example, if the benchmark has not yet reached steady state after 5 minutes with a `RATE` of ten video requests per second, the number of requested videos needs to be larger than 5x60x10=3000 to keep the load going. Therefore, we suggest giving a large value to `VIDEO_COUNT` to sustain a long steady state.

-Other principles are:
-1. The benchmark reaches the steady state when both throughput and concurrent clients are stable while there are few encountered errors. The number of errors would be 0, but occasional errors may occur.
+Other points to consider are:
+1. The benchmark reaches a steady state when both the throughput and the number of concurrent video requests are stable while few errors are encountered. The number of errors should be 0, but occasional errors may occur.
2. If there is a problem in the tuning process, the number of errors will increase rapidly.
-3. In the ramp-up phase, both throughput and concurrent clients will be increasing. The throughput may be stable, but concurrent clients continue to increase. It means that the rate of establishing new clients is higher than the server's capabilities. In this case, consider decreasing the `RATE` parameter of the client container.
-4. If you find the benchmark in a steady state, you might want to increase the `RATE` to see whether the server can handle a higher load.
-5. An overloaded client container would result in errors and crashes. In this case, consider allocating more cores to support more videoperf processes. You can check the client container's CPU utilization using different tools (e.g., docker stats) and compare it against the number of cores on the client machine or the number of cores devoted to the container by docker (e.g., by --cpuset-cpus option).
-6. Remember that videoperf is a highly demanding single-thread process. We recommend that you ensure the number of available cores for the client container is higher than the number of videoperf processes.
+3. In the ramp-up phase, both the throughput and the number of concurrent video requests will increase. The throughput may become stable while the number of concurrent video requests continues to increase. This means that new video requests are being established faster than the server can handle. In this case, consider decreasing the `RATE` parameter of the client container.
+4. If you find the benchmark is in a steady state, you might want to increase the `RATE` parameter to see whether the server can handle a higher load.
+5. Remember that videoperf is a highly demanding single-thread process. Therefore, we recommend ensuring that the number of cores available to the client container is higher than or equal to the number of videoperf processes.
+
[datasetdocker]: https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/media-streaming/dataset/Dockerfile "Dataset Dockerfile"
diff --git a/docs/benchmarks/web-search.md b/docs/benchmarks/web-search.md
index ffc68fa0a..4cfd3c89a 100644
--- a/docs/benchmarks/web-search.md
+++ b/docs/benchmarks/web-search.md
@@ -3,19 +3,17 @@
[![Pulls on DockerHub][dhpulls]][dhrepo]
[![Stars on DockerHub][dhstars]][dhrepo]

-This repository contains the docker image for Cloudsuite's Web Search benchmark.
-
-The Web Search benchmark relies on the [Apache Solr][apachesolr] search engine framework. The benchmark includes a client machine that simulates real-world clients that send requests to the index nodes. The index nodes contain an index of the text and fields found in a set of crawled websites.
+The Web Search benchmark relies on the [Apache Solr][apachesolr] search engine framework. The benchmark includes a client machine that simulates real-world clients that send requests to index nodes. Index nodes contain an index of the text and fields found in crawled websites.

## Using the benchmark ##

### Dockerfiles ###

-Supported tags and their respective `Dockerfile` links:
-- [`index`][indexdocker] This builds an image that crawls a set of given websites to generate an index.
-- [`dataset`][datasetdocker] This downloads an already generated index and mounts it to be used by the server.
-- [`server`][serverdocker] This builds an image for the Apache Solr index nodes. You may spawn several nodes.
-- [`client`][clientdocker] This builds an image with the client node. The client is used to start the benchmark and query the index nodes.
+Supported tags and their respective `Dockerfile` links:
+- [`index`][indexdocker] builds an image that crawls a set of websites to generate an index.
+- [`dataset`][datasetdocker] downloads a pre-generated index and mounts it to be used by the server.
+- [`server`][serverdocker] contains the Apache Solr index nodes. You may spawn several nodes.
+- [`client`][clientdocker] contains the client node based on Faban. The client is used to start the benchmark and query index nodes.

These images are automatically built using the mentioned Dockerfiles available [`here`][repo].

@@ -23,32 +21,28 @@ These images are automatically built using the mentioned Dockerfiles available [

The following command downloads and mounts the dataset index:

- ```sh
+ ```bash
 $ docker run --name web_search_dataset cloudsuite/web-search:dataset
 ```

-### Starting the server (Index Node) ###
-
-To start the server, you have to first `pull` the server image and then, run it. To `pull` the server image, use the following command (to be done on the same host as the dataset):
+The command creates the dataset container, which downloads the dataset from our website and exposes it as a volume.
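+
+Optionally, you can verify that the dataset volume is in place before starting the server. This is a generic Docker inspection, not part of the benchmark:
+
+```bash
+# Lists the mount points the dataset container exposes.
+$ docker inspect -f '{{range .Mounts}}{{.Destination}} {{end}}' web_search_dataset
+```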
- ```sh
- $ docker pull cloudsuite/web-search:server
- ```

### Starting the server (Index Node) ###

-The following command will start the server on port 8983 on the host so that the Apache Solr's web interface can be accessed from the web browser using the host's IP address. More information on Apache Solr's web interface can be found [here][solrui]. The first parameter past to the image indicates the memory allocated for the JAVA process. The pre-generated Solr index occupies 12GB of memory, and therefore we use `14g` to avoid disk access. The second parameter indicates the number of Solr nodes. Because the index is for a single node only, the aforesaid parameter should be `1` always.
+The following command will start the server on port 8983 on the host so that Apache Solr's web interface can be accessed from the web browser using the host's IP address. More information on Apache Solr's web interface can be found [here][solrui]. The first parameter passed to the image indicates the heap memory allocated for the Java process. The pre-generated Solr index occupies 14GB of memory; therefore, we use `14g` to avoid disk access. The second parameter indicates the number of Solr nodes. Because the index is for a single node only, this parameter should always be `1`.

-```sh
+```bash
$ docker run -it --name server --volumes-from web_search_dataset --net host cloudsuite/web-search:server 14g 1
```

-At the beginning of the server booting process, the container prints the `server_address` of the index node. This address will be used in the client container to send the requests to the index node. The `server_address` message in the container should look like this (note that the IP address might change):
+At the beginning of the server booting process, the container prints the `server_address` of the index node. This address is used by the client container to send requests to the index node. The `server_address` message in the container should look like this (note that the IP address might change):

-```sh
+```
Index Node IP Address: 192.168.1.47
```

-The server's boot process might take some time. To see whether the index node is up and responsive, you might want to send a simple query using command `query.sh` provided [here](https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/web-search/server/files/query.sh). If the server is up, you will see the following result.
-```
+The server's boot process might take some time. To see whether the index node is up and responsive, you can send a simple query using the script `query.sh` provided [here](https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/web-search/server/files/query.sh). If the server is up, you will see the following result.
+```bash
$ ./query.sh `server_address`
200
{
@@ -78,39 +72,27 @@ While the benchmark works fine with this warning, to fix it and benefit from hug

### Starting the client and running the benchmark ###

-To start a client you have to first `pull` the client image and then run it. To `pull` the client image, use the following command:
-
-```sh
-$ docker pull cloudsuite/web-search:client
-```
-
-To run the benchmark, start the client container by running the command below:
+To load the server, start the client container by running the command below:

-```sh
+```bash
$ docker run -it --name web_search_client --net host cloudsuite/web-search:client <server_address> <scale>
```

`server_address` is the IP address of the Solr index server, and `scale` defines the number of the load generator's workers.
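+For example, assuming the index node printed `192.168.1.47` during boot and you want 10 load-generator workers (both values are placeholders for illustration):
+
+```bash
+$ docker run -it --name web_search_client --net host cloudsuite/web-search:client 192.168.1.47 10
+```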
Additionally, you can customize the load generator and request distribution by applying the following options:

-- `--ramp-up=`: The ramp-up time, which is the time when the load generator sends request to warm up the server, before the actual measurement starts. The unit is seconds, and its default value is 20.
-- `--ramp-down=`: The ramp-down time. Similar to the ramp-up time, the ramp-down time defines the duration after measurement when the load generator continues. The unit is seconds, and its default value is 10.
+- `--ramp-up=<seconds>`: The ramp-up time, during which the load generator sends requests to warm up the server before the actual measurement starts. The unit is seconds, and its default value is 20.
+- `--ramp-down=<seconds>`: The ramp-down time. Like the ramp-up time, the ramp-down time defines the duration after measurement when the load generator continues. Ramp-down time can avoid a sudden load drop to the server and mimic a real-world scenario. Loads generated in this period are not included in the final statistics. The unit is seconds, and its default value is 10.
- `--steady=<seconds>`: The measurement time. The unit is seconds, and its default value is 60.
-- `--interval-type=`: The method to define the interval for each load generator. `ThinkTime` defines the interval as the duration between the receiving reply and the sending the next request, which `CycleTime` defines the interval as the duration between sending two continuous requests. The default value is `ThinkTime`. Note using `CycleTime` will not change anything if the interval is smaller than single requests' latency, thus the load generator is closed.
-- `--interval-distribution=`: The distribution of the interval. Its default value is `Fixed`
-- `--interval-min=int`: The minimal interval to send requests. The unit is milliseconds and its default value is 1000.
-- `--interval-max=int`: The maximal interval of sending requests. The unit is in milliseconds and its default value is 1500. When using the `Fixed` distribution, this value should be identical with the minimal interval.
-- `--interval-deviation=float`: The deviation of the interval. The unit is percentage, and its default value is 0.
+- `--interval-type=<ThinkTime|CycleTime>`: The method used to define the inter-request interval for each load generator. `ThinkTime` defines the interval as the duration between receiving a reply and sending the next request, while `CycleTime` defines the interval as the duration between sending consecutive requests. The default value is `ThinkTime`. Note that the load generator cannot follow the given interval if its value is smaller than a single request's latency: the load generator is closed-loop.
+- `--interval-distribution=<Fixed|Uniform|NegExp>`: The distribution of the intervals. Its default value is `Fixed`.
+- `--interval-min=<int>`: The minimum interval between sending consecutive requests. The unit is milliseconds, and its default value is 1000.
+- `--interval-max=<int>`: The maximum interval between sending consecutive requests. The unit is in milliseconds, and its default value is 1500. When using the `Fixed` distribution, this value should be identical to the minimum interval.
+- `--interval-deviation=<float>`: The deviation of intervals. The unit is a percentage, and its default value is 0.
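+For instance, a hypothetical invocation (all option values below are illustrative) that measures for five minutes with negative-exponentially distributed intervals:
+
+```bash
+$ docker run -it --name web_search_client --net host cloudsuite/web-search:client 192.168.1.47 10 \
+    --ramp-up=60 --steady=300 --ramp-down=20 \
+    --interval-distribution=NegExp --interval-min=500 --interval-max=2000
+```

### Generating a custom index

-You can use the index image to generate your customized index. To start generating an index, first `pull` the index image by running the following command: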
-```sh
-$ docker pull cloudsuite/web-search:index
-```
-
-Then, create a list of websites that you want to crawl in a file named `seed.txt`. Write each URL in a different line. Then, run the index container using the command below:
+You can use the index image to generate your customized index. To start generating an index, create a list of websites that you want to crawl in a file named `seed.txt`. Write each URL on a separate line. Then, run the index container using the command below:

-```sh
+```bash
$ docker run -dt --name web_search_index -v ${PATH_TO_SEED.TXT}:/usr/src/apache-nutch-1.18/urls/seed.txt cloudsuite/web-search:index
```

@@ -118,21 +100,21 @@ This command will run Nutch and Solr on the container and override the given set

To start the indexing process, run the command below:

-```sh
+```bash
$ docker exec -it web_search_index generate_index
```

-This command crawls up to 100 web pages, starting from the seed URLs, and generates an index for the crawled pages. Finally, it reports the total number of indexed documents. You can continuously run this command until the number of crawled pages or the size of the index reaches your desired value. The index is located at `/usr/src/solr-9.0.0/nutch/data` in the index container. You can copy the index from the index container to the host machine by running the following command:
+This command crawls up to 100 web pages, starting from the seed URLs, and generates an index for the crawled pages. Finally, it reports the total number of indexed documents. You can run this command repeatedly until the number of crawled pages or the index size reaches your desired value. The index is in the index container at `/usr/src/solr-9.1.1/nutch/data`. You can copy the index from the index container to the host machine by running the following command:

-```sh
-$ docker cp web_search_index:/usr/src/solr-9.0.0/nutch/data ${PATH_TO_SAVE_INDEX}
+```bash
+$ docker cp web_search_index:/usr/src/solr-9.1.1/nutch/data ${PATH_TO_SAVE_INDEX}
```

Accordingly, you can modify the server image to use your index instead of the index given by the dataset container.

### Important remarks ###

-- The target response time requires that 99% of the requests are serviced within 200ms.
+- The target response time requires that 99% of the requests be serviced within 200ms.

- The throughput statistic, operations per second, is shown as:

@@ -140,7 +122,7 @@ Accordingly, you can modify the server image to use your in
<metric unit="ops/sec">25.133</metric>
```

-- The response time statistics, average, maximum, standard deviation, 90-th, and 99-th percentiles, are shown as:
+- The response time statistics (average, maximum, standard deviation, and the 90th and 99th percentiles) are shown as:

```xml
@@ -157,15 +139,17 @@ Accordingly, you can modify the server image to use your in

### Additional Information ###

-- This repository contains a 12GB index for a single node. The index was generated by crawling a set of websites with [Apache Nutch][apachenutch]. It's possible to generate indexes for Apache Solr that are both larger and for multiple index nodes. More information on how to generate indexes can be found [here][nutchtutorial].
-
-- The commands to add multiple index nodes are almost identical to the commands executed in the server image. An index has to be copied to Apache Solr's core folder, and then the server is started. The only difference is that the new server nodes have to know the address and the port of the first index node. In our example, it should be `server_address` and `8983`. Note that we also need to use a different port for the servers, for example, `9983`.
+- This repository contains a 14GB index for a single node. The index was generated by crawling a set of websites with [Apache Nutch][apachenutch]. It is possible to generate indexes for Apache Solr that are both larger and for multiple index nodes. More information on how to generate indexes can be found [here][nutchtutorial].
+- The commands to add multiple index nodes are almost identical to the commands executed for the server image. An index has to be copied to Apache Solr's core folder, and then the server can be started. The only difference is that the new server nodes have to know the address and the port of the first index node. In our example, it should be `server_address` and `8983`. You have to run the container with `bash` as the entrypoint, manually modify the last line in `docker-entrypoint.sh` by adding the `-z` option to specify the first index node, and run it. For instance:

-```sh
-$ bin/solr start -cloud -p 9983 -z server_address:8983 -s /usr/src/solr_cores/ -m 12g
+```bash
+$ $SOLR_HOME/bin/solr start -force -cloud -f -p $SOLR_PORT -s $SOLR_CORE_DIR -m $SERVER_HEAP_SIZE -z server_address:8983
```
-
+If two index servers are on the same machine, you may also need to give a different listening port to the second index server.
+
+- The client container uses a list of prepared terms to generate queries. You can find the list of the terms that are indexed in the Solr index, along with their frequency of appearance in different URLs, by running the following query:

```
http://${SERVER_ADDRESS}:8983/solr/cloudsuite_web_search/terms?terms.fl=text&wt=xml&terms=true&terms.limit=10000
diff --git a/docs/benchmarks/web-serving.md b/docs/benchmarks/web-serving.md
index 8e5fddf87..aa0bd1840 100644
--- a/docs/benchmarks/web-serving.md
+++ b/docs/benchmarks/web-serving.md
@@ -3,19 +3,19 @@
[![Pulls on DockerHub][dhpulls]][dhrepo]
[![Stars on DockerHub][dhstars]][dhrepo]

-Web Serving is a primary service in the cloud. Traditional web services with dynamic and static content are moved into the cloud to provide fault-tolerance and dynamic scalability by bringing up the needed number of servers behind a load balancer. Although cloud services use many variants of the traditional web stack (e.g., substituting Apache with other web server software or using other language interpreters in place of PHP), the underlying service architecture remains unchanged. A stateless web server process accepts independent client requests. In response, the web server either directly serves static files from disk or passes the request to a stateless middleware script, written in a high-level interpreted or byte-code compiled language, which is then responsible for producing dynamic content. The middleware stores all the state information in backend databases such as cloud NoSQL data stores or traditional relational SQL servers supported by key-value cache servers to achieve high throughput and low latency. This benchmark includes a social networking engine (Elgg), and a client implemented using the Faban workload generator.
This benchmark version is compatible with Php 8.1, which supports JIT execution and further improves the workload's throughput.
+Web Serving is a primary service in the cloud. Traditional web services with dynamic and static content are moved into the cloud to provide fault tolerance and dynamic scalability by bringing up the required number of servers behind a load balancer. Although cloud services use many variants of the traditional web stack (e.g., substituting Apache with other web server software or using other language interpreters instead of PHP), the underlying service architecture remains unchanged. A stateless web server process accepts independent client requests. In response, the web server either directly serves static files from disk or passes the request to a stateless middleware script, written in a high-level interpreted or byte-code compiled language, which is then responsible for producing dynamic content. The middleware stores all the state information in backend databases such as cloud NoSQL data stores or traditional relational SQL servers supported by key-value cache servers to achieve high throughput and low latency. This benchmark includes a social networking engine (Elgg) and a client implemented using the Faban workload generator. This benchmark version is compatible with PHP 8.1, which supports JIT execution and thus improves the workload's throughput.

## Using the benchmark ##

-The benchmark has four tiers: the web server, the database server, the Memcached server, and the clients. The web server runs Elgg and connects to the Memcached and the database servers. The clients request to log in to the social network and perform various operations, including sending and reading messages, adding or removing friends, posting blogs, and adding comments or likes to different posts. Each tier has its image, which is identified by its tag.
+The benchmark has four tiers: web server, database server, Memcached server, and clients. The web server runs Elgg and connects to the Memcached and database servers. The clients log in to the social network and perform various operations, such as sending and reading messages, adding or removing friends, posting blogs, and adding comments or likes to different posts. Each tier has its own image, identified by its tag.

### Dockerfiles ###

Supported tags and their respective `Dockerfile` links:

- - [`web_server`][webserverdocker]
- - [`memcached_server`][memcacheserverdocker]
- - [`db_server`][databaseserverdocker]
- - [`faban_client`][clientdocker]
+ - [`web_server`][webserverdocker] contains the application code and the web frontend PHP engine.
+ - [`memcached_server`][memcacheserverdocker] contains the Memcached server as the caching layer.
+ - [`db_server`][databaseserverdocker] contains the MariaDB database.
+ - [`faban_client`][clientdocker] contains the load generator based on Faban.

These images are automatically built using the mentioned Dockerfiles available on the `parsa-epfl/cloudsuite` [GitHub repo][repo].

@@ -23,83 +23,78 @@ These images are automatically built using the mentioned Dockerfiles available o

Please note that all IP addresses should refer to the explicit IP address of the host server running each container.
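+
+If you are unsure which address to use, one generic way to look up the host's primary IP on a typical Linux machine (an illustrative helper, not part of the benchmark) is:
+
+```bash
+# Prints the first IP address configured on the host.
+$ hostname -I | awk '{print $1}'
+```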
### Starting the database server ####

-To start the database server, you must first pull the server image using the following command:
-
- $ docker pull cloudsuite/web-serving:db_server
-
The following command will start the database server:

- $ docker run -dt --net=host --name=database_server cloudsuite/web-serving:db_server
+```bash
+$ docker run -it --net=host --name=database_server cloudsuite/web-serving:db_server
+```

-The benchmark starts with a pre-populated database that stores around 100 K users and their data, such as their friends' list and sent messages. The size of the database is around 2.5 GB. Based on your need, you can extend the database size by running the benchmark for some time and inspecting whether the size of the database has reached your desired number.
+The database is pre-populated with around 100K users and their data, such as their friends list and sent messages. The initial size of the database is around 2.5 GB. Based on your need, you can increase the database size by running the benchmark for some time and inspecting whether the database size has reached your desired number.

### Starting the Memcached server ####

-To start the Memcached server, you must first pull the server image. To pull the server image, use the following command:
-
- $ docker pull cloudsuite/web-serving:memcached_server
-
The following command will start the Memcached server:

- $ docker run -dt --net=host --name=memcache_server cloudsuite/web-serving:memcached_server
+```bash
+$ docker run -dt --net=host --name=memcache_server cloudsuite/web-serving:memcached_server
+```

-### Starting the web server ####
-To start the web server, you first have to `pull` the server image by running the following command:
+By default, the Memcached server runs with 4 threads and a 64GB memory buffer.

- $ docker pull cloudsuite/web-serving:web_server
+### Starting the web server ####

To run the web server, use the following command:

- $ docker run -dt --net=host --name=web_server cloudsuite/web-serving:web_server /etc/bootstrap.sh ${PROTOCOL} ${WEB_SERVER_IP} ${DATABASE_SERVER_IP} ${MEMCACHED_SERVER_IP} ${MAX_PM_CHILDREN} ${WORKER_PROCESS}
+```bash
+$ docker run -dt --net=host --name=web_server cloudsuite/web-serving:web_server /etc/bootstrap.sh ${PROTOCOL} ${WEB_SERVER_IP} ${DATABASE_SERVER_IP} ${MEMCACHED_SERVER_IP} ${MAX_PM_CHILDREN} ${WORKER_PROCESS}
+```

-The `PROTOCOL` parameter can either be `http` or `https` and determines the web server's protocol. The `WEB_SERVER_IP`, `DATABASE_SERVER_IP`, and `MEMCACHED_SERVER_IP` parameters refer to the explicit IP of the server running each server. The `MAX_PM_CHILDREN` sets the pm.max_children in the php-fpm setting. The default value is 4 and we recommend considering 4 per core. `WORKER_PROCESS` sets the number of Nginx worker processes. The default is `auto`. We recommend to have 1 Nginx worker process per 32 cores.
+The `PROTOCOL` parameter can either be `http` or `https` and determines the web server's protocol. The `WEB_SERVER_IP`, `DATABASE_SERVER_IP`, and `MEMCACHED_SERVER_IP` parameters refer to the explicit IP of the host server running the corresponding container. The `MAX_PM_CHILDREN` parameter sets `pm.max_children` in the php-fpm settings, which controls the number of worker processes in the php-fpm pool. The default value is 4, and we recommend setting 4 per core to avoid frequent process switching while still hiding I/O latency. The `WORKER_PROCESS` parameter specifies the number of Nginx worker processes. The default is `auto`. We recommend having 1 Nginx worker process per 32 cores.

-To check whether the web server is up, you can access Elgg's home page through a web browser by `http://:8080` or `https://:8443` URLs for HTTP and HTTPS web servers, respectively. For example, Elgg's home page is shown in the figure below:
+To check whether the web server is up, you can access Elgg's home page through a web browser at `http://<WEB_SERVER_IP>:8080` or `https://<WEB_SERVER_IP>:8443` for HTTP and HTTPS web servers, respectively. For example, Elgg's home page is shown in the figure below:

*(figure: screenshot of Elgg's home page)*

-You may see different content based on the latest recorded activities in the database. You can log in as the service administrator using `admin` and `adminadmin` as the username and the password in the `log in` menu. Then, you will have access to the administration dashboard, where you can modify different settings based on your need.
+You may see different content based on the latest recorded activities in the database. You can log in as the service administrator using `admin` as the username and `adminadmin` as the password in the `log in` menu. Then, you will have access to the administration dashboard, where you can modify different settings based on your need.

-You can find the list of usernames and passwords of regular users in [this](https://github.com/parsa-epfl/cloudsuite/blob/update_web_serving_bigDB/benchmarks/web-serving/faban_client/files/users.list) file. You can log in as a regular user and see various services and features available for an Elgg user. It will help you better understand how the benchmark's client container works.
+You can find the list of usernames and passwords of regular users in [this](https://github.com/parsa-epfl/cloudsuite/blob/update_web_serving_bigDB/benchmarks/web-serving/faban_client/files/users.list) file. You can log in as a regular user and see the various services and features available to an Elgg user. This will help you better understand how the benchmark's client container works.

### Running the benchmark ###

-First, `pull` the client image using the following command:
-
- $ docker pull cloudsuite/web-serving:faban_client
-
To start the client container which runs the benchmark, use the following command:

- $ docker run --net=host --name=faban_client cloudsuite/web-serving:faban_client ${WEB_SERVER_IP} ${LOAD_SCALE}
+```bash
+$ docker run --net=host --name=faban_client cloudsuite/web-serving:faban_client ${WEB_SERVER_IP} ${LOAD_SCALE}
+```

-The last command has a mandatory parameter to set the IP of the web server, and an optional parameter to specify the load scale (default is 1). The `LOAD_SCALE` parameter controls the number of users that log in to the web server and request social networking pages. You can scale it up and down as much as you would like, keeping in mind that scaling the number of threads may stress the system. To tune the benchmark, we recommend testing your machines for the maximum possible request throughput while maintaining your target QoS metric (we use 99th percentile latency). CPU utilization is less important than the latency and responsiveness for these benchmarks.
+The command has a mandatory parameter `WEB_SERVER_IP` to set the IP of the web server and an optional parameter `LOAD_SCALE` to specify the load scale (default is 1). The `LOAD_SCALE` parameter controls the number of users that log in to the web server and request social networking pages. You can scale it up and down as much as you would like, considering that scaling the number of users may stress the system. To tune the benchmark, we recommend testing your machines for the maximum possible request throughput while maintaining your target Quality of Service (QoS) metric (we use the 99th percentile latency). CPU utilization is less important than the latency and responsiveness for these benchmarks.

-The client container offers multiple options to control how the benchmark runs. The list of the available options is as follows:
+The client container offers multiple options to control how the benchmark runs. The list of available options is as follows:

-- `--oper=`: This option has three possible values: `usergen`, `run`, and `usergen&run`. In `usergen`, the benchmark only creates new users and adds them to the database. `LOAD_SCALE` determines the number of generated users. On the other hand, `run` does not generate new users but starts the benchmark by logging in the users to interact with the server by sending various requests. Note that the database container holds ~100 K users. Therefore, you can start running the benchmark without needing to generate new users. Finally, `usergen&run` does what previous operations do together. The default value is `usergen&run`.
-- `--ramp-up=<# of seconds>`: the number of seconds the benchmark spends in the ramp-up phase. Remember that there is a one-second distance between the users' log-in requests. Therefore, please set the ramp-up time to a value larger than `LOAD_SCALE` to ensure the number of active users in the steady state matches your given number. The default value is 10.
+- `--oper=<usergen|run|usergen&run>`: This option has three possible values: `usergen`, `run`, and `usergen&run`. In `usergen`, the benchmark only creates new users and adds them to the database. `LOAD_SCALE` determines the number of generated users. On the other hand, `run` does not generate new users but starts the benchmark by logging in the users to interact with the server by sending various requests. Note that the pre-populated database container holds ~100K users. Therefore, you can start running the benchmark without needing to generate new users. Finally, `usergen&run` performs both operations. The default value is `usergen&run`.
+- `--ramp-up=<# of seconds>`: the number of seconds the benchmark spends in the ramp-up phase. Remember that there is a one-second interval between the users' log-in requests. Therefore, please set the ramp-up time to a value larger than `LOAD_SCALE` to ensure the number of active users in the steady state matches your given number. The default value is 10.
- `--ramp-down=<# of seconds>`: the number of seconds the benchmark spends in the ramp-down phase. The default value is 10.
- `--steady=<# of seconds>`: the number of seconds the benchmark spends in the steady-state phase where all users are logged in and interacting with the server. The default value is 30.
-- `--min=<# of seconds>`: the minimum number of milliseconds between the consecutive requests of a specific user. The default value is 1000.
-- `--max=<# of seconds>`: the maximum number of milliseconds between the consecutive requests of a specific user. The default value is 1500.
-- `--type=`: This option has two possible values: `THINKTIME` and `CYCLETIME`. In the former, the latencies determined by `--min` and `--max` to send a new request are measured compared to when the last request finishes. On the other hand, the `CYCLETIME` policy calculates the latencies from the moment the request is sent to the server. The default value is `THINKTIME`.
-- `--dist=`: This option controls the distribution of the values chosen for the inter-request latencies between `--min` and `--max`. You can give three possible values to this option: `fixed`, `uniform`, and `negexp` (that refers to the negative exponential distribution). The default value is `fixed`.
-- `--encryption=<1 | 0>`: The client container can send the request to a TLS encrypted server by setting this option to 1. The default value is 0, which means the client sends HTTP requests. This value should be adjust according to the `PROTOCOL` parameter you set when setting up the server.
+- `--min=<# of milliseconds>`: the minimum number of milliseconds between consecutive requests of a specific user. The default value is 1000.
+- `--max=<# of milliseconds>`: the maximum number of milliseconds between consecutive requests of a specific user. The default value is 1500.
+- `--type=<THINKTIME|CYCLETIME>`: This option has two possible values: `THINKTIME` and `CYCLETIME`. For the former value, the latency before sending a new request (determined by `--min` and `--max`) is measured relative to when the last request finishes. On the other hand, the `CYCLETIME` policy calculates the latency from the moment a request is sent to the server. The default value is `THINKTIME`.
+- `--dist=<fixed|uniform|negexp>`: This option controls the distribution of values chosen for the inter-request latencies between `--min` and `--max`. You can give three possible values to this option: `fixed`, `uniform`, and `negexp` (negative exponential distribution). The default value is `fixed`.
+- `--encryption=<1 | 0>`: The client container can send requests to a TLS-encrypted server by setting this option to 1. The default value is 0, which means the client sends HTTP requests. This value should be adjusted according to the `PROTOCOL` parameter you set when setting up the server.

-The client container will output the summary of the benchmark results in XML after the benchmark finishes. You can also access the summary and logs of the run by mounting the `/faban/output` directory of the container in the host filesystem (e.g., `-v /host/path:/faban/output`).
+The client container will output the summary of the benchmark results in XML format after the benchmark finishes. You can also access the summary and logs of the run by mounting the `/faban/output` directory of the container in the host filesystem (e.g., `-v /host/path:/faban/output`).

### Possible User Operations

Elgg provides different features for the users. The client image implements a subset of the available features. The main driver of the client image is in [this](https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/web-serving/faban_client/files/web20_benchmark/src/workload/driver/Web20Driver.java.in) file. You can enrich the list of requests that the client driver supports by modifying the mentioned file. The current version supports the following requests:

- Browse the home page
-- Browse recent activities of the site
+- Browse recent activities on the site
- Browse a user's profile
-- Log in a user
+- Log in as a user
- Register a new user
- Log out a user
- Add a friend
- Remove a friend
-- Browse the friends' list
-- Check the notifications
+- Browse the friends list
+- Check notifications
- Post a wire (A wire is similar to a tweet or a status update)
- Browse the wires page
- Reply to a wire
@@ -108,19 +103,19 @@ Elgg provides different features for the users. 
The client image implements a su - Send a message - Read a message - Delete a message -- Browse the sent messages +- Browse sent messages - Browse the blogs page - Post a blog - Like a blog post - Comment on a blog -- Search a site member +- Search for a site member - [webserverdocker]: https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/web-serving/web_server/Dockerfile "WebServer Dockerfile" - [memcacheserverdocker]: https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/web-serving/memcached_server "MemcacheServer Dockerfile" - [databaseserverdocker]: https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/web-serving/db_server/Dockerfile "DatabaseServer(MariaDB) Dockerfile" - [clientdocker]: https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/web-serving/faban_client/Dockerfile "Client Dockerfile" +[webserverdocker]: https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/web-serving/web_server/Dockerfile "WebServer Dockerfile" +[memcacheserverdocker]: https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/web-serving/memcached_server "MemcacheServer Dockerfile" +[databaseserverdocker]: https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/web-serving/db_server/Dockerfile "DatabaseServer(MariaDB) Dockerfile" +[clientdocker]: https://github.com/parsa-epfl/cloudsuite/blob/main/benchmarks/web-serving/faban_client/Dockerfile "Client Dockerfile" - [repo]: https://github.com/parsa-epfl/cloudsuite/tree/main/benchmarks/web-serving "GitHub Repo" - [dhrepo]: https://hub.docker.com/r/cloudsuite/web-serving/ "DockerHub Page" - [dhpulls]: https://img.shields.io/docker/pulls/cloudsuite/web-serving.svg "Go to DockerHub Page" - [dhstars]: https://img.shields.io/docker/stars/cloudsuite/web-serving.svg "Go to DockerHub Page" +[repo]: https://github.com/parsa-epfl/cloudsuite/tree/main/benchmarks/web-serving "GitHub Repo" +[dhrepo]: https://hub.docker.com/r/cloudsuite/web-serving/ "DockerHub Page" +[dhpulls]: https://img.shields.io/docker/pulls/cloudsuite/web-serving.svg "Go to DockerHub Page" +[dhstars]: https://img.shields.io/docker/stars/cloudsuite/web-serving.svg "Go to DockerHub Page"