Complete BigQuery ML Tutorial: Machine Learning with SQL
BigQuery ML enables you to create and execute machine learning models using SQL queries. No data movement required - train and predict directly on your data warehouse.
Why BigQuery ML?
Key Benefits:- SQL-based: Use familiar SQL syntax
- No data movement: Train on data in place
- Scalable: Leverage BigQuery infrastructure
- Integrated: Works with BigQuery ecosystem
- Multiple models: Classification, regression, clustering
- Linear/Logistic Regression
- XGBoost, Random Forest
- Deep Neural Networks
- K-means Clustering
- Time Series (ARIMA)
- Matrix Factorization
Prerequisites
pip install google-cloud-bigquery
gcloud auth login
Quick Start
1. Create Classification Model
-- Create logistic regression model
CREATE OR REPLACE MODEL project.dataset.churnmodel
OPTIONS(
modeltype='LOGISTICREG',
inputlabelcols=['churn'],
autoclassweights=TRUE
) AS
SELECT
age,
tenuremonths,
monthlycharges,
totalcharges,
contracttype,
churn
FROM project.dataset.customers
WHERE PARTITIONTIME BETWEEN '2024-01-01' AND '2024-06-01';
2. Evaluate Model
-- Evaluate model performance
SELECT
FROM
ML.EVALUATE(MODEL project.dataset.churnmodel,
(SELECT FROM project.dataset.customers
WHERE PARTITIONTIME > '2024-06-01'));
3. Make Predictions
-- Predict churn probability
SELECT
customerid,
predictedchurn,
predictedchurnprobs
FROM
ML.PREDICT(MODEL project.dataset.churnmodel,
(SELECT FROM project.dataset.newcustomers
));
Model Types
1. XGBoost Classifier
CREATE OR REPLACE MODEL project.dataset.xgbclassifier
OPTIONS(
modeltype='BOOSTEDTREECLASSIFIER',
numparalleltree=5,
maxiterations=50,
learnrate=0.1,
inputlabelcols=['target']
) AS
SELECT FROM project.dataset.trainingdata;
2. Random Forest
CREATE OR REPLACE MODEL project.dataset.rfmodel
OPTIONS(
modeltype='RANDOMFORESTCLASSIFIER',
numparalleltree=100,
maxtreedepth=10,
inputlabelcols=['label']
) AS
SELECT FROM project.dataset.trainingdata
;
3. Deep Neural Network
CREATE OR REPLACE MODEL project.dataset.dnnmodel
OPTIONS(
modeltype='DNNCLASSIFIER',
hiddenunits=[128, 64, 32],
dropout=0.2,
batchsize=256,
inputlabelcols=['label']
) AS
SELECT FROM project.dataset.trainingdata;
4. Linear Regression
CREATE OR REPLACE MODEL project.dataset.pricepredictor
OPTIONS(
modeltype='LINEARREG',
inputlabelcols=['price'],
l2reg=0.1
) AS
SELECT
bedrooms,
bathrooms,
sqft,
location,
price
FROM project.dataset.houses;
5. K-Means Clustering
CREATE OR REPLACE MODEL project.dataset.customersegments
OPTIONS(
modeltype='KMEANS',
numclusters=5,
standardizefeatures=TRUE
) AS
SELECT
age,
income,
spendingscore,
purchasefrequency
FROM project.dataset.customers;
-- Get cluster assignments
SELECT
customerid,
CENTROIDID as segment
FROM
ML.PREDICT(MODEL project.dataset.customersegments,
(SELECT FROM project.dataset.customers));