From 3b9fe4de06f9e73b1157b040a52a628937dd7ca3 Mon Sep 17 00:00:00 2001
From: Ayo
Date: Tue, 2 Sep 2025 21:21:02 +0200
Subject: [PATCH] feat: add inference examples

---
 README.md                  | 16 ++++------------
 amp-inference.py           | 16 ++++++++++++++++
 env.sh                     |  5 +++++
 fp32-inference.py          | 15 +++++++++++++++
 main.py                    | 15 ++++++++++++++-
 torch-compile-inference.py | 30 ++++++++++++++++++++++++++++++
 6 files changed, 84 insertions(+), 13 deletions(-)
 create mode 100644 amp-inference.py
 create mode 100644 env.sh
 create mode 100644 fp32-inference.py
 create mode 100644 torch-compile-inference.py

diff --git a/README.md b/README.md
index dfd2adc..c66dbfc 100644
--- a/README.md
+++ b/README.md
@@ -6,21 +6,13 @@ After installing `ipex-llm` which is required to use Intel GPUs (see documentati
 ## Setup
-1. Activate the conda environment
+1. Source `env.sh` to activate the conda environment and set the necessary environment variables:
 ```bash
-conda activate llm-pt26
+$ . env.sh
 ```
-2. Set the necessary environmental variables:
-
-```bash
-unset OCL_ICD_VENDORS
-export SYCL_CACHE_PERSISTENT=1
-export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-```
-
-3. (Optional) Confirm if XPU is detected
+2. (Optional) Confirm that the XPU is detected
 ```bash
 $ python # go into the python shell
 $ torch.xpu.get_device_name()
 ```
-Links
+## Links
 - [Install IPEX-LLM on Intel GPU with PyTorch 2.6](https://git.ayo.run/ayo/ipex-llm/src/branch/main/docs/mddocs/Quickstart/install_pytorch26_gpu.md)
 - [Get started with PyTorch locally](https://pytorch.org/get-started/locally/)
diff --git a/amp-inference.py b/amp-inference.py
new file mode 100644
index 0000000..b3bb2a6
--- /dev/null
+++ b/amp-inference.py
@@ -0,0 +1,16 @@
+import torch
+import torchvision.models as models
+
+model = models.resnet50(weights="ResNet50_Weights.DEFAULT")
+model.eval()
+data = torch.rand(1, 3, 224, 224)
+
+model = model.to("xpu")
+data = data.to("xpu")
+
+with torch.no_grad():
+    # set dtype=torch.bfloat16 for BF16
+    with torch.autocast(device_type="xpu", dtype=torch.float16, enabled=True):
+        model(data)
+
+print("Execution finished")
diff --git a/env.sh b/env.sh
new file mode 100644
index 0000000..67d5e9c
--- /dev/null
+++ b/env.sh
@@ -0,0 +1,5 @@
+conda activate llm-pt26
+
+unset OCL_ICD_VENDORS
+export SYCL_CACHE_PERSISTENT=1
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
diff --git a/fp32-inference.py b/fp32-inference.py
new file mode 100644
index 0000000..df87ad2
--- /dev/null
+++ b/fp32-inference.py
@@ -0,0 +1,15 @@
+import torch
+import torchvision.models as models
+
+model = models.resnet50(weights="ResNet50_Weights.DEFAULT")
+model.eval()
+data = torch.rand(1, 3, 224, 224)
+
+model = model.to("xpu")
+data = data.to("xpu")
+
+with torch.no_grad():
+    output = model(data)
+    print(output)
+
+print("Execution finished")
diff --git a/main.py b/main.py
index 9c6de66..7d7c372 100644
--- a/main.py
+++ b/main.py
@@ -1,3 +1,16 @@
 import torch
-x = torch.rand(5, 3)
+
+# tensor_1 = torch.randn(1, 1, 40, 128).to('xpu')
+# tensor_2 = torch.randn(1, 1, 128, 40).to('xpu')
+# print(tensor_1)
+# print(tensor_2)
+# print(torch.matmul(tensor_1, tensor_2).size())
+
+print(torch.xpu.is_available())
+print(torch.xpu.get_device_name(0))
+
+x = torch.rand(5, 99999).to('xpu')
 print(x)
+
+print(torch.xpu.memory_allocated())
+print(torch.xpu.memory_reserved())
diff --git a/torch-compile-inference.py b/torch-compile-inference.py
new file mode 100644
index 0000000..d9508c7
--- /dev/null
+++ b/torch-compile-inference.py
@@ -0,0 +1,30 @@
+import torch
+import torchvision.models as models
+import time
+
+model = models.resnet50(weights="ResNet50_Weights.DEFAULT")
+model.eval()
+data = torch.rand(1, 3, 224, 224)
+ITERS = 10
+
+model = model.to("xpu")
+data = data.to("xpu")
+
+for i in range(ITERS):
+    start = time.time()
+    with torch.no_grad():
+        model(data)
+    torch.xpu.synchronize()
+    end = time.time()
+    print(f"Inference time before torch.compile for iteration {i}: {(end-start)*1000} ms")
+
+model = torch.compile(model)
+for i in range(ITERS):
+    start = time.time()
+    with torch.no_grad():
+        model(data)
+    torch.xpu.synchronize()
+    end = time.time()
+    print(f"Inference time after torch.compile for iteration {i}: {(end-start)*1000} ms")
+
+print("Execution finished")
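
A note on the `torch.compile` timing loop above: the first few iterations after `torch.compile` include graph capture and kernel compilation, so the early "after" numbers measure compile overhead rather than steady-state latency. The sketch below separates warmup from measurement. It is illustrative only and assumes the same environment as the patch; the `WARMUP` constant and the averaged timing are additions that do not appear in the patch.

```python
import time

import torch
import torchvision.models as models

# Same model and device setup as torch-compile-inference.py.
model = models.resnet50(weights="ResNet50_Weights.DEFAULT")
model.eval()
model = model.to("xpu")
data = torch.rand(1, 3, 224, 224).to("xpu")

model = torch.compile(model)

WARMUP = 3   # assumed value: iterations discarded to absorb compile overhead
ITERS = 10

with torch.no_grad():
    # Warmup runs: trigger graph capture and kernel compilation.
    for _ in range(WARMUP):
        model(data)
    torch.xpu.synchronize()

    # Measured runs: steady-state inference only.
    start = time.time()
    for _ in range(ITERS):
        model(data)
    torch.xpu.synchronize()
    end = time.time()

print(f"Steady-state inference: {(end - start) / ITERS * 1000:.2f} ms/iter")
```

Synchronizing once per phase, rather than per iteration, keeps host-side timing overhead out of the average; the per-iteration prints in the patch remain useful for observing the compile-time spike itself.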