Complete Ray Serve Tutorial: Scalable ML Model Serving
Ray Serve is a scalable model serving library built on Ray. It enables you to serve ML models with automatic scaling, batching, and multi-model composition, making it ideal for production ML deployments.
Why Ray Serve?
Ray Serve Advantages:
- Framework agnostic: Works with any ML framework
- Scalable: Automatic scaling based on load
- Composable: Combine multiple models easily
- Batching: Automatic request batching
- Native Python: Simple Python-first API
- Model serving at scale
- Multi-model pipelines
- A/B testing
- Real-time inference
- Batch inference
Installation
pip install "ray[serve]"
Verify installation
python -c "import ray; from ray import serve; print(ray.__version__)"
Quick Start
1. Basic Deployment
from ray import serve
import ray
# Initialize Ray, then start Serve, before any deployment is defined or deployed.
ray.init()
serve.start()
@serve.deployment
class ModelDeployment:
    """Minimal Ray Serve deployment that answers every request with a fixed message."""

    def __init__(self):
        """Set up the replica; a plain string stands in for a real model object."""
        self.model = "simplemodel"

    def __call__(self, request):
        """Handle one incoming HTTP request.

        Args:
            request: The incoming request object; it is not inspected here.

        Returns:
            dict: A JSON-serializable payload naming the model that handled
            the request.
        """
        return {"message": f"Processed by {self.model}"}
# Deploy
# serve.run() with a bound deployment replaces the legacy `.deploy()` method
# (deprecated in Ray 2.0 and later removed). The application is exposed at the
# default route prefix "/".
serve.run(ModelDeployment.bind())

# Test
import requests

# A timeout keeps the example from hanging forever if Serve is not reachable.
response = requests.get("http://localhost:8000/", timeout=10)
print(response.json())
2. With FastAPI
from ray import serve
from fastapi import FastAPI
import ray
app = FastAPI()
@serve.deployment
@serve.ingress(app)
class MLService:
    """Ray Serve deployment whose HTTP routes are defined by the FastAPI ``app``."""

    def __init__(self):
        """Load the model once per replica so request handlers can reuse it."""
        self.model = self.loadmodel()

    def loadmodel(self):
        """Return the model object; a placeholder string in this example."""
        return "mymodel"

    @app.get("/predict")
    def predict(self, text: str):
        """Return a prediction payload for the ``text`` query parameter.

        Args:
            text: Input text taken from the request's query string.

        Returns:
            dict: ``{"prediction": ...}`` echoing the input text.
        """
        return {"prediction": f"Result for: {text}"}

    @app.get("/health")
    def health(self):
        """Return a static health-check payload."""
        return {"status": "healthy"}
# Initialize Ray, then deploy MLService through Ray Serve.
ray.init()
serve.run(MLService.bind())
3. Serve ML Model
from ray import serve
import ray
import pickle
import numpy as np
@serve.deployment
class SklearnModel:
    """Ray Serve deployment that serves predictions from a pickled model."""

    def __init__(self, modelpath: str):
        """Load the pickled model from disk.

        Args:
            modelpath: Path to the pickle file containing the fitted model.
        """
        # SECURITY: pickle.load can execute arbitrary code while
        # deserializing. Only load model files from a trusted source.
        with open(modelpath, "rb") as f:
            self.model = pickle.load(f)

    async def __call__(self, request):
        """Predict for the single sample carried in the request's JSON body.

        Args:
            request: HTTP request whose JSON body has a ``"features"`` entry.

        Returns:
            dict: ``{"prediction": [...]}`` with the model output as a list.
        """
        data = await request.json()
        # reshape(1, -1) turns the features into one row, i.e. a single sample.
        features = np.array(data["features"]).reshape(1, -1)
        prediction = self.model.predict(features)
        return {"prediction": prediction.tolist()}
# Initialize Ray, then deploy SklearnModel, passing the pickle path to its constructor.
ray.init()
serve.run(SklearnModel.bind(modelpath="model.pkl"))
Deployment Configuration
1. Resource Allocation
from ray import serve
@serve.deployment(
    num_replicas=3,
    ray_actor_options={
        "num_cpus": 2,
        "num_gpus": 1,
        "memory": 4 * 1024 * 1024 * 1024,  # 4GB, expressed in bytes
    },
)
class GPUModel:
    """Ray Serve deployment that runs a small PyTorch model on a GPU."""

    def __init__(self):
        """Select the CUDA device and load the model onto it."""
        # torch is imported inside the methods so the import happens in the
        # replica process rather than wherever this class is defined.
        import torch

        self.device = torch.device("cuda")
        self.model = self.loadmodel()

    def loadmodel(self):
        """Build the example model (a 10 -> 2 linear layer) on ``self.device``."""
        import torch

        model = torch.nn.Linear(10, 2)
        return model.to(self.device)

    async def __call__(self, request):
        """Run inference on the ``"input"`` field of the request's JSON body.

        Args:
            request: HTTP request whose JSON body has an ``"input"`` entry
                holding the numeric input for the model.

        Returns:
            dict: ``{"output": [...]}`` with the model output as nested lists.
        """
        import torch

        data = await request.json()
        # Force float32: JSON integers would otherwise produce an int64
        # tensor, which the float-weighted Linear layer rejects.
        tensor = torch.tensor(
            data["input"], dtype=torch.float32, device=self.device
        )
        # Gradients are not needed for inference.
        with torch.no_grad():
            output = self.model(tensor)
        return {"output": output.cpu().tolist()}
2. Autoscaling
from ray import serve
from ray.serve.config import AutoscalingConfig
@serve.deployment(
autoscalingconfig=AutoscalingConfig(
minreplicas=1,
maxreplicas=10,
targetnumongoingrequestsperreplica=5,
upscaledelays=10,
downscaledelays=30
)
)
class AutoscaledModel:
def init(self):
self.model = "autoscaledmodel"
async def call(self, request):