From ce2f8eace757498edd17f01f45ba3a58da0be536 Mon Sep 17 00:00:00 2001
From: zhangli
Date: Tue, 14 Mar 2017 16:37:35 +0800
Subject: [PATCH 1/3] refactor

---
 README.md | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 00000000..a94e306c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,127 @@
+
+BigDL currently comes with a Scala API. However, Python is a powerful programming language for data analysis with a large number of useful libraries, so we are developing a lightweight Python binding on top of PySpark that lets you use BigDL natively from Python.
+
+The Python API is almost identical to the Scala version, and it maps each ndarray to a tensor for the training samples, so users only need to care about how to manipulate ndarrays.
+
+This Python binding has been tested with Python 2.7 and Spark 1.6.0.
+
+Here are the steps for training a simple LeNet model:
+
+1) Create an RDD[Sample]:
+```
+RDD[..] --transform-->RDD[ndarray, ndarray].map(Sample.from_ndarray(features, label)) --> RDD[Sample]
+```
+
+2) Define a model:
+```
+def build_model(class_num):
+    model = Sequential()
+    model.add(Reshape([1, 28, 28]))
+    model.add(SpatialConvolution(1, 6, 5, 5))
+    model.add(Tanh())
+    model.add(SpatialMaxPooling(2, 2, 2, 2))
+    model.add(Tanh())
+    model.add(SpatialConvolution(6, 12, 5, 5))
+    model.add(SpatialMaxPooling(2, 2, 2, 2))
+    model.add(Reshape([12 * 4 * 4]))
+    model.add(Linear(12 * 4 * 4, 100))
+    model.add(Tanh())
+    model.add(Linear(100, class_num))
+    model.add(LogSoftMax())
+    return model
+```
+
+3) Create an Optimizer and train:
+```
+optimizer = Optimizer(
+    model=build_model(10),
+    training_rdd=train_data,
+    criterion=ClassNLLCriterion(),
+    optim_method="SGD",
+    state=state,
+    end_trigger=MaxEpoch(100),
+    batch_size=int(options.batchSize))
+optimizer.setvalidation(
+    batch_size=32,
+    val_rdd=test_data,
+    trigger=EveryEpoch(),
+    val_method=["top1"]
+)
+optimizer.setcheckpoint(EveryEpoch(), "/tmp/lenet5/")
+trained_model = optimizer.optimize()
+```
+
+4) The full LeNet example can be found in models/lenet5.py
+
+## Run Python tests
+* Package the Scala code: ```$BigDL_HOME/make-dist.sh```
+* Set SPARK_HOME and then run: ```$BigDL_HOME/dl/src/main/python/dev/run-all.sh```
+
+## Installing on Ubuntu
+1. Build BigDL
+[Build Page](https://github.com/intel-analytics/BigDL/wiki/Build-Page)
+ * With Spark1.6: ``` $BIGDL_HOME//make-dist.sh ```
+ * With Spark2.0: ``` $BIGDL_HOME//make-dist.sh -P spark_2.0 ```
+
+2. Install Python dependencies:
+ * Install NumPy:
+ ```sudo apt-get install python-numpy```
+
+ * Install Python setuptools:
+ ```sudo apt-get install -y python-setuptools python-pip```
+
+ * Install Jupyter:
+ ```sudo pip install jupyter```
+
+## Run a LeNet example on a standalone cluster
+
+```
+BigDL_HOME=...
+SPARK_HOME=...
+MASTER=...
+PYTHON_API_ZIP_PATH=${BigDL_HOME}/dist/lib/bigdl-0.1.0-SNAPSHOT-python-api.zip
+BigDL_JAR_PATH=${BigDL_HOME}/dist/lib/bigdl-0.1.0-SNAPSHOT-jar-with-dependencies.jar
+PYTHONPATH=${PYTHON_API_ZIP_PATH}:$PYTHONPATH
+${SPARK_HOME}/bin/spark-submit \
+  --master ${MASTER} \
+  --driver-cores 5 \
+  --driver-memory 10g \
+  --total-executor-cores 80 \
+  --executor-cores 10 \
+  --executor-memory 20g \
+  --conf spark.akka.frameSize=64 \
+  --py-files ${PYTHON_API_ZIP_PATH},${BigDL_HOME}/dl/src/main/python/models/lenet/lenet5.py \
+  --properties-file ${BigDL_HOME}/dist/conf/spark-bigdl.conf \
+  --jars ${BigDL_JAR_PATH} \
+  --conf spark.driver.extraClassPath=${BigDL_JAR_PATH} \
+  --conf spark.executor.extraClassPath=bigdl-0.1.0-SNAPSHOT-jar-with-dependencies.jar \
+  ${BigDL_HOME}/dl/src/main/python/models/lenet/lenet5.py
+```
+
+## Launch Jupyter on a standalone cluster
+
+```
+BigDL_HOME=...
+SPARK_HOME=...
+MASTER=...
+PYTHON_API_ZIP_PATH=${BigDL_HOME}/dist/lib/bigdl-0.1.0-SNAPSHOT-python-api.zip
+BigDL_JAR_PATH=${BigDL_HOME}/dist/lib/bigdl-0.1.0-SNAPSHOT-jar-with-dependencies.jar
+
+export PYTHONPATH=${PYTHON_API_ZIP_PATH}:$PYTHONPATH
+export IPYTHON_OPTS="notebook --notebook-dir=./ --ip=* --no-browser"
+
+${SPARK_HOME}/bin/pyspark \
+  --master ${MASTER} \
+  --properties-file ${BigDL_HOME}/dist/conf/spark-bigdl.conf \
+  --driver-cores 5 \
+  --driver-memory 10g \
+  --total-executor-cores 8 \
+  --executor-cores 1 \
+  --executor-memory 20g \
+  --conf spark.akka.frameSize=64 \
+  --py-files ${PYTHON_API_ZIP_PATH} \
+  --jars ${BigDL_JAR_PATH} \
+  --conf spark.driver.extraClassPath=${BigDL_JAR_PATH} \
+  --conf spark.executor.extraClassPath=bigdl-0.1.0-SNAPSHOT-jar-with-dependencies.jar
+```
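
A note on step 1 of the README above: the one-line diagram packs the whole data pipeline together. Below is a minimal Python 2.7 sketch of the same pipeline, as an illustration rather than code from the patch: the random ndarrays stand in for a real dataset, `sc` is the SparkContext provided by the pyspark shell, and the `util.common` import path is an assumption that may differ between BigDL layouts.

```
import numpy as np
from util.common import Sample  # assumed import path; may differ by BigDL layout

# Stand-in (features, label) ndarray pairs; a real job would derive these
# by transforming its input RDD.
records = [(np.random.rand(28 * 28), np.array([1.0])) for _ in range(4)]

# RDD[..] --transform--> RDD[(ndarray, ndarray)] --map--> RDD[Sample]
raw_rdd = sc.parallelize(records)
sample_rdd = raw_rdd.map(lambda rec: Sample.from_ndarray(rec[0], rec[1]))
```

The resulting `sample_rdd` is what step 3 passes to the Optimizer as `training_rdd`.
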
From 27fa496c6617a6356ecee67f4ea7cc0cf9092005 Mon Sep 17 00:00:00 2001
From: zhangli
Date: Wed, 22 Mar 2017 10:20:26 +0800
Subject: [PATCH 2/3] change wiki link after code restructuring

---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index a94e306c..ed72d7ea 100644
--- a/README.md
+++ b/README.md
@@ -55,13 +55,13 @@ RDD[..] --transform-->RDD[ndarray, ndarray].map(Sample.from_ndarray(features, la
 ## Run Python tests
 * Package the Scala code: ```$BigDL_HOME/make-dist.sh```
-* Set SPARK_HOME and then run: ```$BigDL_HOME/dl/src/main/python/dev/run-all.sh```
+* Set SPARK_HOME and then run: ```$BigDL_HOME/pyspark/dev/run-all.sh```
 
 ## Installing on Ubuntu
 1. Build BigDL
 [Build Page](https://github.com/intel-analytics/BigDL/wiki/Build-Page)
- * With Spark1.6: ``` $BIGDL_HOME//make-dist.sh ```
- * With Spark2.0: ``` $BIGDL_HOME//make-dist.sh -P spark_2.0 ```
+ * With Spark1.6: ``` $BIGDL_HOME/make-dist.sh ```
+ * With Spark2.0: ``` $BIGDL_HOME/make-dist.sh -P spark_2.0 ```
 
 2. Install Python dependencies:
  * Install NumPy:
  ```sudo apt-get install python-numpy```
@@ -90,12 +90,12 @@ RDD[..] --transform-->RDD[ndarray, ndarray].map(Sample.from_ndarray(features, la
   --executor-cores 10 \
   --executor-memory 20g \
   --conf spark.akka.frameSize=64 \
-  --py-files ${PYTHON_API_ZIP_PATH},${BigDL_HOME}/dl/src/main/python/models/lenet/lenet5.py \
+  --py-files ${PYTHON_API_ZIP_PATH},${BigDL_HOME}/pyspark/bigdl/models/lenet/lenet5.py \
   --properties-file ${BigDL_HOME}/dist/conf/spark-bigdl.conf \
   --jars ${BigDL_JAR_PATH} \
   --conf spark.driver.extraClassPath=${BigDL_JAR_PATH} \
   --conf spark.executor.extraClassPath=bigdl-0.1.0-SNAPSHOT-jar-with-dependencies.jar \
-  ${BigDL_HOME}/dl/src/main/python/models/lenet/lenet5.py
+  ${BigDL_HOME}/pyspark/bigdl/models/lenet/lenet5.py
 ```
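
The "Launch Jupyter on a standalone cluster" section above starts the notebook server but never shows a first cell. A minimal sketch of what that cell might contain follows, as an illustration rather than code from these patches. The pyspark-backed kernel already provides the SparkContext `sc`; the `init_engine` helper and its import path are assumptions based on the BigDL Python API of this period and may differ by version.

```
# Initialize BigDL's JVM-side engine once, before defining or training models.
from util.common import init_engine  # assumed path; possibly bigdl.util.common after the restructuring

init_engine()
```
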
From 1c38bb6a4a6416a2db625889a45121e16a5c329d Mon Sep 17 00:00:00 2001
From: zhichao-li
Date: Tue, 21 Mar 2017 16:06:55 +0800
Subject: [PATCH 3/3] add Tensor wrapper

add CrossEntropyCriterion
---
 README.md | 127 -------------------------------------------------------
 1 file changed, 127 deletions(-)
 delete mode 100644 README.md

diff --git a/README.md b/README.md
deleted file mode 100644
index ed72d7ea..00000000
--- a/README.md
+++ /dev/null
@@ -1,127 +0,0 @@
-
-BigDL currently comes with a Scala API. However, Python is a powerful programming language for data analysis with a large number of useful libraries, so we are developing a lightweight Python binding on top of PySpark that lets you use BigDL natively from Python.
-
-The Python API is almost identical to the Scala version, and it maps each ndarray to a tensor for the training samples, so users only need to care about how to manipulate ndarrays.
-
-This Python binding has been tested with Python 2.7 and Spark 1.6.0.
-
-Here are the steps for training a simple LeNet model:
-
-1) Create an RDD[Sample]:
-```
-RDD[..] --transform-->RDD[ndarray, ndarray].map(Sample.from_ndarray(features, label)) --> RDD[Sample]
-```
-
-2) Define a model:
-```
-def build_model(class_num):
-    model = Sequential()
-    model.add(Reshape([1, 28, 28]))
-    model.add(SpatialConvolution(1, 6, 5, 5))
-    model.add(Tanh())
-    model.add(SpatialMaxPooling(2, 2, 2, 2))
-    model.add(Tanh())
-    model.add(SpatialConvolution(6, 12, 5, 5))
-    model.add(SpatialMaxPooling(2, 2, 2, 2))
-    model.add(Reshape([12 * 4 * 4]))
-    model.add(Linear(12 * 4 * 4, 100))
-    model.add(Tanh())
-    model.add(Linear(100, class_num))
-    model.add(LogSoftMax())
-    return model
-```
-
-3) Create an Optimizer and train:
-```
-optimizer = Optimizer(
-    model=build_model(10),
-    training_rdd=train_data,
-    criterion=ClassNLLCriterion(),
-    optim_method="SGD",
-    state=state,
-    end_trigger=MaxEpoch(100),
-    batch_size=int(options.batchSize))
-optimizer.setvalidation(
-    batch_size=32,
-    val_rdd=test_data,
-    trigger=EveryEpoch(),
-    val_method=["top1"]
-)
-optimizer.setcheckpoint(EveryEpoch(), "/tmp/lenet5/")
-trained_model = optimizer.optimize()
-```
-
-4) The full LeNet example can be found in models/lenet5.py
-
-## Run Python tests
-* Package the Scala code: ```$BigDL_HOME/make-dist.sh```
-* Set SPARK_HOME and then run: ```$BigDL_HOME/pyspark/dev/run-all.sh```
-
-## Installing on Ubuntu
-1. Build BigDL
-[Build Page](https://github.com/intel-analytics/BigDL/wiki/Build-Page)
- * With Spark1.6: ``` $BIGDL_HOME/make-dist.sh ```
- * With Spark2.0: ``` $BIGDL_HOME/make-dist.sh -P spark_2.0 ```
-
-2. Install Python dependencies:
- * Install NumPy:
- ```sudo apt-get install python-numpy```
-
- * Install Python setuptools:
- ```sudo apt-get install -y python-setuptools python-pip```
-
- * Install Jupyter:
- ```sudo pip install jupyter```
-
-## Run a LeNet example on a standalone cluster
-
-```
-BigDL_HOME=...
-SPARK_HOME=...
-MASTER=...
-PYTHON_API_ZIP_PATH=${BigDL_HOME}/dist/lib/bigdl-0.1.0-SNAPSHOT-python-api.zip
-BigDL_JAR_PATH=${BigDL_HOME}/dist/lib/bigdl-0.1.0-SNAPSHOT-jar-with-dependencies.jar
-PYTHONPATH=${PYTHON_API_ZIP_PATH}:$PYTHONPATH
-${SPARK_HOME}/bin/spark-submit \
-  --master ${MASTER} \
-  --driver-cores 5 \
-  --driver-memory 10g \
-  --total-executor-cores 80 \
-  --executor-cores 10 \
-  --executor-memory 20g \
-  --conf spark.akka.frameSize=64 \
-  --py-files ${PYTHON_API_ZIP_PATH},${BigDL_HOME}/pyspark/bigdl/models/lenet/lenet5.py \
-  --properties-file ${BigDL_HOME}/dist/conf/spark-bigdl.conf \
-  --jars ${BigDL_JAR_PATH} \
-  --conf spark.driver.extraClassPath=${BigDL_JAR_PATH} \
-  --conf spark.executor.extraClassPath=bigdl-0.1.0-SNAPSHOT-jar-with-dependencies.jar \
-  ${BigDL_HOME}/pyspark/bigdl/models/lenet/lenet5.py
-```
-
-## Launch Jupyter on a standalone cluster
-
-```
-BigDL_HOME=...
-SPARK_HOME=...
-MASTER=...
-PYTHON_API_ZIP_PATH=${BigDL_HOME}/dist/lib/bigdl-0.1.0-SNAPSHOT-python-api.zip
-BigDL_JAR_PATH=${BigDL_HOME}/dist/lib/bigdl-0.1.0-SNAPSHOT-jar-with-dependencies.jar
-
-export PYTHONPATH=${PYTHON_API_ZIP_PATH}:$PYTHONPATH
-export IPYTHON_OPTS="notebook --notebook-dir=./ --ip=* --no-browser"
-
-${SPARK_HOME}/bin/pyspark \
-  --master ${MASTER} \
-  --properties-file ${BigDL_HOME}/dist/conf/spark-bigdl.conf \
-  --driver-cores 5 \
-  --driver-memory 10g \
-  --total-executor-cores 8 \
-  --executor-cores 1 \
-  --executor-memory 20g \
-  --conf spark.akka.frameSize=64 \
-  --py-files ${PYTHON_API_ZIP_PATH} \
-  --jars ${BigDL_JAR_PATH} \
-  --conf spark.driver.extraClassPath=${BigDL_JAR_PATH} \
-  --conf spark.executor.extraClassPath=bigdl-0.1.0-SNAPSHOT-jar-with-dependencies.jar
-```
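
Patch 3/3 introduces a Tensor wrapper and a CrossEntropyCriterion on the Python side. As a sketch of what the new criterion enables, and an illustration rather than code from the patch: in Torch-style APIs, CrossEntropyCriterion conventionally combines LogSoftMax with ClassNLLCriterion, so it could replace ClassNLLCriterion in step 3 of the (now removed) README, with the model definition dropping its final LogSoftMax() layer.

```
# All names besides CrossEntropyCriterion come from step 3 of the README above.
optimizer = Optimizer(
    model=build_model(10),              # assumes build_model now omits LogSoftMax()
    training_rdd=train_data,
    criterion=CrossEntropyCriterion(),  # in place of ClassNLLCriterion()
    optim_method="SGD",
    state=state,
    end_trigger=MaxEpoch(100),
    batch_size=int(options.batchSize))
trained_model = optimizer.optimize()
```
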