From 5e4c269a4911b554d6969aacac3fa6c0b47eabc1 Mon Sep 17 00:00:00 2001 From: CharleneHu94 <37971369+CharleneHu94@users.noreply.github.com> Date: Fri, 21 Oct 2022 17:01:37 +0800 Subject: [PATCH] [PPML] TPC-DS doc upadte (#6238) * ppml tpcds doc update * fix * update data generation step --- .../QuickStart/tpc-ds_with_sparksql_on_k8s.md | 371 +++++++++--------- 1 file changed, 189 insertions(+), 182 deletions(-) diff --git a/docs/readthedocs/source/doc/PPML/QuickStart/tpc-ds_with_sparksql_on_k8s.md b/docs/readthedocs/source/doc/PPML/QuickStart/tpc-ds_with_sparksql_on_k8s.md index e70f75f2..18bdf659 100644 --- a/docs/readthedocs/source/doc/PPML/QuickStart/tpc-ds_with_sparksql_on_k8s.md +++ b/docs/readthedocs/source/doc/PPML/QuickStart/tpc-ds_with_sparksql_on_k8s.md @@ -8,207 +8,214 @@ ### Prepare TPC-DS kit and data -1. Download and compile tpc-ds +1. Download and compile TPC-DS kit - ```bash - git clone --recursive https://github.com/intel-analytics/zoo-tutorials.git - cd /path/to/zoo-tutorials - git clone https://github.com/databricks/tpcds-kit.git - cd tpcds-kit/tools - make OS=LINUX - ``` +```bash +git clone --recursive https://github.com/intel-analytics/zoo-tutorials.git +cd zoo-tutorials/tpcds-spark +git clone https://github.com/databricks/tpcds-kit.git +cd tpcds-kit/tools +make OS=LINUX +cd ../../ +sbt package +``` 2. Generate data - ```bash - cd /path/to/zoo-tutorials - cd tpcds-spark/spark-sql-perf - sbt "test:runMain com.databricks.spark.sql.perf.tpcds.GenTPCDSData -d <dsdgenDir> -s <scaleFactor> -l <dataDir> -f parquet" - ``` +```bash +cd /path/to/zoo-tutorials/tpcds-spark/spark-sql-perf +sbt "test:runMain com.databricks.spark.sql.perf.tpcds.GenTPCDSData -d <dsdgenDir> -s <scaleFactor> -l <dataDir> -f parquet" +``` - `dsdgenDir` is the path of `tpcds-kit/tools`, `scaleFactor` is the size of the data, for example `-s 1` will generate 1G data, `dataDir` is the path to store generated data.
+`dsdgenDir` is the path of `tpcds-kit/tools`, `scaleFactor` indicates data size, for example `-s 1` will generate data of 1GB scale factor, `dataDir` is the path to store generated data. ### Deploy PPML TPC-DS on Kubernetes +1. Pull docker image -1. Compile Kit +```bash +sudo docker pull intelanalytics/bigdl-ppml-trusted-big-data-ml-python-graphene:2.1.0-SNAPSHOT +``` - ```bash - cd zoo-tutorials/tpcds-spark - sbt package - ``` +2. Prepare keys, password and k8s configurations (follow instructions [here](https://github.com/intel-analytics/BigDL/tree/main/ppml/trusted-big-data-ml/python/docker-graphene#11-prepare-the-keyspassworddataenclave-keypem "here")), make sure keys, `tpcds-spark` and generated tpc-ds data can be accessed on each K8S node, e.g. deploy on distributed storage including NFS and HDFS. +3. Start a bigdl-ppml enabled Spark K8S client container with configured local IP, key, tpc-ds and kubeconfig path, also configure data path if your data is stored on local FS -2. Create external tables +```bash +export ENCLAVE_KEY=/YOUR_DIR/keys/enclave-key.pem +export TPCDS_PATH=/YOUR_DIR/zoo-tutorials/tpcds-spark +export DATA_PATH=/YOUR_DIR/data +export KEYS_PATH=/YOUR_DIR/keys +export SECURE_PASSWORD_PATH=/YOUR_DIR/password +export KUBECONFIG_PATH=/YOUR_DIR/kubeconfig +export LOCAL_IP=$local_ip +export DOCKER_IMAGE=intelanalytics/bigdl-ppml-trusted-big-data-ml-python-graphene:2.1.0-SNAPSHOT +sudo docker run -itd \ + --privileged \ + --net=host \ + --name=spark-k8s-client \ + --oom-kill-disable \ + --device=/dev/sgx/enclave \ + --device=/dev/sgx/provision \ + -v /var/run/aesmd/aesm.socket:/var/run/aesmd/aesm.socket \ + -v $ENCLAVE_KEY:/graphene/Pal/src/host/Linux-SGX/signer/enclave-key.pem \ + -v $TPCDS_PATH:/ppml/trusted-big-data-ml/work/tpcds-spark \ + -v $DATA_PATH:/ppml/trusted-big-data-ml/work/data \ + -v $KEYS_PATH:/ppml/trusted-big-data-ml/work/keys \ + -v $SECURE_PASSWORD_PATH:/ppml/trusted-big-data-ml/work/password \ + -v
$KUBECONFIG_PATH:/root/.kube/config \ + -e RUNTIME_SPARK_MASTER=k8s://https://$LOCAL_IP:6443 \ + -e RUNTIME_K8S_SERVICE_ACCOUNT=spark \ + -e RUNTIME_K8S_SPARK_IMAGE=$DOCKER_IMAGE \ + -e RUNTIME_DRIVER_HOST=$LOCAL_IP \ + -e RUNTIME_DRIVER_PORT=54321 \ + -e RUNTIME_EXECUTOR_INSTANCES=1 \ + -e RUNTIME_EXECUTOR_CORES=4 \ + -e RUNTIME_EXECUTOR_MEMORY=20g \ + -e RUNTIME_TOTAL_EXECUTOR_CORES=4 \ + -e RUNTIME_DRIVER_CORES=4 \ + -e RUNTIME_DRIVER_MEMORY=10g \ + -e SGX_MEM_SIZE=64G \ + -e SGX_LOG_LEVEL=error \ + -e LOCAL_IP=$LOCAL_IP \ + $DOCKER_IMAGE bash +``` - ```bash - $SPARK_HOME/bin/spark-submit \ - --class "createTables" \ - --master <spark-master> \ - --driver-memory 20G \ - --executor-cores <executor-cores> \ - --total-executor-cores <total-executor-cores> \ - --executor-memory 20G \ - --jars spark-sql-perf/target/scala-2.12/spark-sql-perf_2.12-0.5.1-SNAPSHOT.jar \ - target/scala-2.12/tpcds-benchmark_2.12-0.1.jar <dataDir> <dsdgenDir> - ``` +4. Attach to the client container -3. Pull docker image +```bash +sudo docker exec -it spark-k8s-client bash +``` - ```bash - sudo docker pull intelanalytics/bigdl-ppml-trusted-big-data-ml-python-graphene:2.1.0-SNAPSHOT - ``` +5. Create external tables -4. Prepare SGX keys (following instructions [here](https://github.com/intel-analytics/BigDL/tree/main/ppml/trusted-big-data-ml/python/docker-graphene#11-prepare-the-keyspassworddataenclave-keypem "here")), make sure keys and tpcds-spark can be accessed on each K8S node -5. Start a bigdl-ppml enabled Spark K8S client container with configured local IP, key, tpc-ds and kuberconfig path +```bash +cd /ppml/trusted-big-data-ml/work/tpcds-spark +$SPARK_HOME/bin/spark-submit \ + --class "createTables" \ + --master <spark-master> \ + --driver-memory 20G \ + --executor-cores <executor-cores> \ + --total-executor-cores <total-executor-cores> \ + --executor-memory 20G \ + --jars spark-sql-perf/target/scala-2.12/spark-sql-perf_2.12-0.5.1-SNAPSHOT.jar \ + target/scala-2.12/tpcds-benchmark_2.12-0.1.jar <dataDir> <dsdgenDir> +``` +`<dataDir>` and `<dsdgenDir>` are the generated data path and `tpcds-kit/tools` path, both should be accessible in the container.
After successfully creating tables, there should be a directory `metastore_db` in the current working path. - ```bash - export ENCLAVE_KEY=/YOUR_DIR/keys/enclave-key.pem - export DATA_PATH=/YOUR_DIR/zoo-tutorials/tpcds-spark - export KEYS_PATH=/YOUR_DIR/keys - export SECURE_PASSWORD_PATH=/YOUR_DIR/password - export KUBERCONFIG_PATH=/YOUR_DIR/kuberconfig - export LOCAL_IP=$local_ip - export DOCKER_IMAGE=intelanalytics/bigdl-ppml-trusted-big-data-ml-python-graphene:2.1.0-SNAPSHOT - sudo docker run -itd \ - --privileged \ - --net=host \ - --name=spark-local-k8s-client \ - --oom-kill-disable \ - --device=/dev/sgx/enclave \ - --device=/dev/sgx/provision \ - -v /var/run/aesmd/aesm.socket:/var/run/aesmd/aesm.socket \ - -v $ENCLAVE_KEY:/graphene/Pal/src/host/Linux-SGX/signer/enclave-key.pem \ - -v $DATA_PATH:/ppml/trusted-big-data-ml/work/tpcds-spark \ - -v $KEYS_PATH:/ppml/trusted-big-data-ml/work/keys \ - -v $SECURE_PASSWORD_PATH:/ppml/trusted-big-data-ml/work/password \ - -v $KUBERCONFIG_PATH:/root/.kube/config \ - -e RUNTIME_SPARK_MASTER=k8s://https://$LOCAL_IP:6443 \ - -e RUNTIME_K8S_SERVICE_ACCOUNT=spark \ - -e RUNTIME_K8S_SPARK_IMAGE=$DOCKER_IMAGE \ - -e RUNTIME_DRIVER_HOST=$LOCAL_IP \ - -e RUNTIME_DRIVER_PORT=54321 \ - -e RUNTIME_EXECUTOR_INSTANCES=1 \ - -e RUNTIME_EXECUTOR_CORES=4 \ - -e RUNTIME_EXECUTOR_MEMORY=20g \ - -e RUNTIME_TOTAL_EXECUTOR_CORES=4 \ - -e RUNTIME_DRIVER_CORES=4 \ - -e RUNTIME_DRIVER_MEMORY=10g \ - -e SGX_MEM_SIZE=64G \ - -e SGX_LOG_LEVEL=error \ - -e LOCAL_IP=$LOCAL_IP \ - $DOCKER_IMAGE bash - ``` +6. Modify `/ppml/trusted-big-data-ml/spark-executor-template.yaml`, add path of `enclave-key`, `tpcds-spark` and `kubeconfig`. If data is not stored on HDFS, also configure mount volume `data` and make sure `mountPath` is the same as `<dataDir>` used in create table step. -6. Attach to the client container - - ```bash - sudo docker exec -it spark-local-k8s-client bash - ``` - -7.
Modify `spark-executor-template.yaml`, add path of `enclave-key`, `tpcds-spark` and `kuberconfig` on host - - ```yaml - apiVersion: v1 - kind: Pod - spec: - containers: - - name: spark-executor - securityContext: - privileged: true - volumeMounts: +```yaml +apiVersion: v1 +kind: Pod +spec: + containers: + - name: spark-executor + securityContext: + privileged: true + volumeMounts: + - name: enclave-key + mountPath: /graphene/Pal/src/host/Linux-SGX/signer/enclave-key.pem ... - - name: tpcds - mountPath: /ppml/trusted-big-data-ml/work/tpcds-spark - - name: kubeconf - mountPath: /root/.kube/config - volumes: - - name: enclave-key - hostPath: - path: /root/keys/enclave-key.pem + - name: tpcds + mountPath: /ppml/trusted-big-data-ml/work/tpcds-spark + - name: data + mountPath: /mounted/path/to/data + - name: kubeconf + mountPath: /root/.kube/config + volumes: + - name: enclave-key + hostPath: + path: /path/to/keys/enclave-key.pem ... - - name: tpcds - hostPath: - path: /path/to/tpcds-spark - - name: kubeconf - hostPath: - path: /path/to/kuberconfig - ``` + - name: tpcds + hostPath: + path: /path/to/tpcds-spark + - name: data + hostPath: + path: /path/to/data + - name: kubeconf + hostPath: + path: /path/to/kubeconfig +``` -8. Execute TPC-DS queries +7. Execute TPC-DS queries - Optional argument `QUERY` is the query number to run. Multiple query numbers should be separated by space, e.g. `1 2 3`. If no query number is specified, all 1-99 queries would be executed. +Optional argument `QUERY` is the query number to run. Multiple query numbers should be separated by space, e.g. `1 2 3`. If no query number is specified, all 1-99 queries would be executed. Configure `$hdfs_host_ip` and `$hdfs_port` if the output is stored on HDFS. - ```bash - secure_password=`openssl rsautl -inkey /ppml/trusted-big-data-ml/work/password/key.txt -decrypt /performance` directory. +After benchmark is finished, the performance result is saved as `part-*.csv` file under `/performance` directory.