[PPML] Modify the quickstart document of TPC-DS (#4441)
* Change command order
* [PPML] Modify the quickstart document of TPC-DS
* Remove blank
This commit is contained in:

parent 56aee11c82
commit 154f32ce97

1 changed file with 26 additions and 68 deletions
@@ -30,14 +30,35 @@ sbt "test:runMain com.databricks.spark.sql.perf.tpcds.GenTPCDSData -d <dsdgenDir
 ### Deploy PPML TPC-DS on Kubernetes
 
-1. Pull docker image
+1. Compile Kit
+
+```
+cd zoo-tutorials/tpcds-spark
+sbt package
+```
+
+2. Create external tables
+
+```
+$SPARK_HOME/bin/spark-submit \
+        --class "createTables" \
+        --master <spark-master> \
+        --driver-memory 20G \
+        --executor-cores <executor-cores> \
+        --total-executor-cores <total-cores> \
+        --executor-memory 20G \
+        --jars spark-sql-perf/target/scala-2.12/spark-sql-perf_2.12-0.5.1-SNAPSHOT.jar \
+        target/scala-2.12/tpcds-benchmark_2.12-0.1.jar <dataDir> <dsdgenDir> <scaleFactor>
+```
+
+3. Pull docker image
 
 ```
 sudo docker pull intelanalytics/bigdl-ppml-trusted-big-data-ml-python-graphene:2.1.0-SNAPSHOT
 ```
 
-2. Prepare SGX keys, make sure keys and tpcds-spark can be accessed on each K8S node
+4. Prepare SGX keys, make sure keys and tpcds-spark can be accessed on each K8S node
 
-3. Start a bigdl-ppml enabled Spark K8S client container with configured local IP, key, tpc-ds and kuberconfig path
+5. Start a bigdl-ppml enabled Spark K8S client container with configured local IP, key, tpc-ds and kuberconfig path
 
 ```
 export ENCLAVE_KEY=/root/keys/enclave-key.pem
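Step 4 in the new text names no command for preparing the SGX keys. A minimal sketch, assuming the Graphene convention used elsewhere in BigDL PPML (an RSA-3072 signing key with public exponent 3) and the `/root/keys` path implied by the `ENCLAVE_KEY` export above; the exact path on your nodes is an assumption, not part of this commit:

```
# Sketch (assumption, not from this commit): generate the enclave signing key.
# Graphene requires an RSA-3072 key with public exponent 3.
openssl genrsa -3 -out /root/keys/enclave-key.pem 3072
# The key and the tpcds-spark directory must be reachable from every K8S node,
# e.g. copied to the same path on each node or placed on shared storage.
```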
@@ -75,13 +96,13 @@ sudo docker run -itd \
         $DOCKER_IMAGE bash
 ```
 
-4. Attach to the client container
+6. Attach to the client container
 
 ```
 sudo docker exec -it spark-local-k8s-client bash
 ```
 
-5. Modify `spark-executor-template.yaml`, add path of `enclave-key`, `tpcds-spark` and `kuberconfig` on host
+7. Modify `spark-executor-template.yaml`, add path of `enclave-key`, `tpcds-spark` and `kuberconfig` on host
 
 ```
 apiVersion: v1
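The body of the `sudo docker run -itd \` command from step 5 is collapsed in this diff view; only its last line (`$DOCKER_IMAGE bash`) is visible above. A hedged sketch of the shape such a command takes in comparable BigDL PPML quickstarts; every flag and mount path here is an assumption, not this file's actual content (only the container name `spark-local-k8s-client` and the image tag are confirmed by the visible lines):

```
# Sketch (assumptions throughout): start the bigdl-ppml Spark K8S client container.
export DOCKER_IMAGE=intelanalytics/bigdl-ppml-trusted-big-data-ml-python-graphene:2.1.0-SNAPSHOT
sudo docker run -itd \
        --net=host \
        --name=spark-local-k8s-client \
        --device=/dev/sgx/enclave \
        --device=/dev/sgx/provision \
        -v $ENCLAVE_KEY:/graphene/Pal/src/host/Linux-SGX/signer/enclave-key.pem \
        -v /path/to/tpcds-spark:/ppml/trusted-big-data-ml/work/tpcds-spark \
        -v /path/to/kuberconfig:/root/.kube/config \
        -e LOCAL_IP=$LOCAL_IP \
        $DOCKER_IMAGE bash
```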
@@ -110,69 +131,6 @@ spec:
         path: /path/to/kuberconfig
 ```
 
-6. Compile Kit
-
-```
-cd zoo-tutorials/tpcds-spark
-sbt package
-```
-
-7. Create external tables
-
-```
-export TF_MKL_ALLOC_MAX_BYTES=10737418240 && \
-export SPARK_LOCAL_IP=$LOCAL_IP && \
-export HDFS_HOST=$hdfs_host_ip && \
-export HDFS_PORT=$hdfs_port && \
-export TPCDS_DIR=/ppml/trusted-big-data-ml/work/tpcds-spark \
-export INPUT_DIR=$TPCDS_DIR/input \
-export DSDGEN_DIR=tpcds-kit/tools \
-export SCALE_FACTOR=1
-  /opt/jdk8/bin/java \
-    -cp '$TPCDS_DIR/spark-sql-perf/target/scala-2.12/spark-sql-perf_2.12-0.5.1-SNAPSHOT.jar:$TPCDS_DIR/target/scala-2.12/tpcds-benchmark_2.12-0.1.jar:$TPCH_DIR/input/*:$DSDGEN_DIR/*:/ppml/trusted-big-data-ml/work/spark-3.1.2/conf/:/ppml/trusted-big-data-ml/work/spark-3.1.2/jars/*' \
-    -Xmx10g \
-    -Dbigdl.mklNumThreads=1 \
-    org.apache.spark.deploy.SparkSubmit \
-    --master $RUNTIME_SPARK_MASTER \
-    --deploy-mode client \
-    --name spark-tpch-sgx \
-    --conf spark.driver.host=$LOCAL_IP \
-    --conf spark.driver.port=54321 \
-    --conf spark.driver.memory=10g \
-    --conf spark.driver.blockManager.port=10026 \
-    --conf spark.blockManager.port=10025 \
-    --conf spark.scheduler.maxRegisteredResourcesWaitingTime=5000000 \
-    --conf spark.worker.timeout=600 \
-    --conf spark.python.use.daemon=false \
-    --conf spark.python.worker.reuse=false \
-    --conf spark.network.timeout=10000000 \
-    --conf spark.starvation.timeout=250000 \
-    --conf spark.rpc.askTimeout=600 \
-    --conf spark.sql.autoBroadcastJoinThreshold=-1 \
-    --conf spark.io.compression.codec=lz4 \
-    --conf spark.sql.shuffle.partitions=8 \
-    --conf spark.speculation=false \
-    --conf spark.executor.heartbeatInterval=10000000 \
-    --conf spark.executor.instances=24 \
-    --executor-cores 8 \
-    --total-executor-cores 192 \
-    --executor-memory 16G \
-    --properties-file /ppml/trusted-big-data-ml/work/bigdl-2.1.0-SNAPSHOT/conf/spark-bigdl.conf \
-    --conf spark.kubernetes.authenticate.serviceAccountName=spark \
-    --conf spark.kubernetes.container.image=$RUNTIME_K8S_SPARK_IMAGE \
-    --conf spark.kubernetes.executor.podTemplateFile=/ppml/trusted-big-data-ml/spark-executor-template.yaml \
-    --conf spark.kubernetes.executor.deleteOnTermination=false \
-    --conf spark.kubernetes.executor.podNamePrefix=spark-tpcds-sgx \
-    --conf spark.kubernetes.sgx.enabled=true \
-    --conf spark.kubernetes.sgx.mem=32g \
-    --conf spark.kubernetes.sgx.jvm.mem=10g \
-    --class "createTables" \
-    --verbose \
-    --jars $TPCDS_DIR/spark-sql-perf/target/scala-2.12/spark-sql-perf_2.12-0.5.1-SNAPSHOT.jar \
-    $TPCDS_DIR/target/scala-2.12/tpcds-benchmark_2.12-0.1.jar \
-    $INPUT_DIR $DSDGEN_DIR $SCALE_FACTOR
-```
-
 8. Execute TPC-DS queries
 
 Optional argument `QUERY` is the query number to run. Multiple query numbers should be separated by space, e.g. `1 2 3`. If no query number is specified, all 1-99 queries would be executed.
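The diff does not show the submit command for step 8; only the description of the `QUERY` argument survives as context. A hypothetical sketch of how that argument might be passed, mirroring the step-2 `createTables` submit; the class name `TPCDSBenchmark` and the argument order are assumptions, not taken from the commit:

```
# Hypothetical sketch: run only queries 1, 2 and 3.
# Drop the trailing numbers to run all 99 queries.
$SPARK_HOME/bin/spark-submit \
        --class "TPCDSBenchmark" \
        --master <spark-master> \
        --jars spark-sql-perf/target/scala-2.12/spark-sql-perf_2.12-0.5.1-SNAPSHOT.jar \
        target/scala-2.12/tpcds-benchmark_2.12-0.1.jar 1 2 3
```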