Offline Deployment of a Dask Distributed Cluster and JupyterHub on Kubernetes
For deploying Dask and JupyterHub on k8s with public network access, see the previous article: Dask 分布式集群部署 (物理机和 k8s) 及实战.
Environment
- k8s version == v1.20.5
- dask-kubernetes-operator == 2024.5.0
- jupyterhub == 2.0.0 (python-version=?)
1. Deploy the dask-operator
Offline installation
# Install dask-operator offline
helm repo add dask https://helm.dask.org
helm repo update
mkdir /work/helm-charts
cd /work/helm-charts
# Download the dask-kubernetes-operator chart for offline use
helm pull dask/dask-kubernetes-operator
ls -l
dask-kubernetes-operator-2024.5.0.tgz
# Extract it
tar -zxvf dask-kubernetes-operator-2024.5.0.tgz
# Pull the operator image
docker pull ghcr.io/dask/dask-kubernetes-operator:2024.5.0
# Tag it for the private registry
docker tag ghcr.io/dask/dask-kubernetes-operator:2024.5.0 baidu-harbor.baidu.com/deeplearning/dask-kubernetes-operator:2024.5.0
# Push it
docker push baidu-harbor.baidu.com/deeplearning/dask-kubernetes-operator:2024.5.0
# Install from the local chart
helm install dask-kubernetes-operator ./dask-kubernetes-operator --create-namespace -n dask-operator
# Check that the operator came up
kubectl get pods -n dask-operator
[root@192.168.1.101 work]#kubectl get pods -n dask-operator
NAME READY STATUS RESTARTS AGE
dask-kubernetes-operator-686bfb958f-kscjf 1/1 Running 0 2m18s
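The chart as pulled still defaults to the ghcr.io image, so in an air-gapped cluster the image reference likely needs to be overridden to the private Harbor. A minimal sketch, assuming the chart exposes image.name and image.tag values (confirm with helm show values ./dask-kubernetes-operator):
helm upgrade --install dask-kubernetes-operator ./dask-kubernetes-operator \
  --create-namespace -n dask-operator \
  --set image.name=baidu-harbor.baidu.com/deeplearning/dask-kubernetes-operator \
  --set image.tag=2024.5.0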
2. Deploy JupyterHub
2.1 Add the jupyterhub Helm repo
helm repo add jupyterhub https://hub.jupyter.org/helm-chart/
helm repo update
2.2 Download the Helm chart locally
# Browse available chart versions at https://hub.jupyter.org/helm-chart/
# To pin a version: 3.3.7 requires k8s version >= 1.23.0-0
# helm pull jupyterhub/jupyterhub --version 3.3.7
# As of 2024-06-23, the default pull is the latest version, 3.3.7
# jupyterhub 2.0.0 requires k8s version >= 1.20.0
# After downloading the chart, `cat Chart.yaml` shows the dependency versions
helm pull jupyterhub/jupyterhub --version 2.0.0
[root@192.168.1.101 work]#ls -l
total 88
drwxr-xr-x 4 root root 108 Jun 22 18:11 dask-kubernetes-operator
-rw-r--r-- 1 root root 40432 Jun 22 17:58 dask-kubernetes-operator-2024.5.0.tgz
-rw-r--r-- 1 root root 45296 Jun 23 12:42 jupyterhub-2.0.0.tgz
2.3 Extract it
[root@192.168.1.101 work]# tar -zxvf jupyterhub-2.0.0.tgz
2.4 Inspect the images JupyterHub depends on:
cd jupyterhub/
[root@192.168.1.101 jupyterhub]#cat Chart.yaml
annotations:
  artifacthub.io/images: |
    - image: jupyterhub/configurable-http-proxy:4.5.3
      name: configurable-http-proxy
    - image: jupyterhub/k8s-hub:2.0.0
      name: k8s-hub
    - image: jupyterhub/k8s-image-awaiter:2.0.0
      name: k8s-image-awaiter
    - image: jupyterhub/k8s-network-tools:2.0.0
      name: k8s-network-tools
    - image: jupyterhub/k8s-secret-sync:2.0.0
      name: k8s-secret-sync
    - image: jupyterhub/k8s-singleuser-sample:2.0.0
      name: k8s-singleuser-sample
    - image: k8s.gcr.io/kube-scheduler:v1.23.10
      name: kube-scheduler
    - image: k8s.gcr.io/pause:3.8
      name: pause
    - image: k8s.gcr.io/pause:3.8
      name: pause
    - image: traefik:v2.8.4
      name: traefik
apiVersion: v2
appVersion: 3.0.0
description: Multi-user Jupyter installation
home: https://z2jh.jupyter.org
icon: https://jupyterhub.github.io/helm-chart/images/hublogo.svg
keywords:
  - jupyter
  - jupyterhub
  - z2jh
kubeVersion: '>=1.20.0-0'
maintainers:
  - email: erik@sundellopensource.se
    name: Erik Sundell
  - name: Simon Li
    url: https://github.com/manics/
name: jupyterhub
sources:
  - https://github.com/jupyterhub/zero-to-jupyterhub-k8s
version: 2.0.0
[root@192.168.1.101 jupyterhub]#
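Rather than copying the image list out of Chart.yaml by hand, it can be extracted mechanically; a quick sketch using standard tools, assuming each entry keeps the `- image:` layout shown above:
# List the image references declared in the artifacthub.io/images annotation
grep 'image:' Chart.yaml | awk '{print $3}'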
Manually download the images to a local machine.
Add China-based registry mirrors:
vim /etc/docker/daemon.json
{
  "registry-mirrors": [
    "https://registry.docker-cn.com",
    "http://hub-mirror.c.163.com",
    "https://docker.mirrors.ustc.edu.cn",
    "https://cr.console.aliyun.com",
    "https://mirror.ccs.tencentyun.com"
  ]
}
Reload and restart Docker:
systemctl daemon-reload
systemctl restart docker.service
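To confirm the mirrors took effect, they should be listed under Registry Mirrors in the daemon info:
docker info | grep -A 6 'Registry Mirrors'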
Pull the images. A line marked (ok) succeeded from that source, (x) failed, and (mac, ok) had to be pulled on a machine with unrestricted internet access (a Mac here) and transferred over. Note that kube-scheduler is pulled at v1.20.15 rather than the chart's v1.23.10, to match the cluster's k8s 1.20.5:
# docker pull jupyterhub/configurable-http-proxy:4.5.3
docker pull quay.io/jupyterhub/configurable-http-proxy:4.5.3 (ok)
# docker pull jupyterhub/k8s-hub:2.0.0
docker pull quay.io/jupyterhub/k8s-hub:2.0.0 (ok)
docker pull jupyterhub/k8s-image-awaiter:2.0.0 (ok)
docker pull jupyterhub/k8s-network-tools:2.0.0 (ok)
# docker pull jupyterhub/k8s-secret-sync:2.0.0
docker pull quay.io/jupyterhub/k8s-secret-sync:2.0.0 (ok)
# docker pull jupyterhub/k8s-singleuser-sample:2.0.0
# docker pull quay.io/jupyterhub/k8s-singleuser-sample:2.0.0 (x)
docker pull jupyterhub/k8s-singleuser-sample:2.0.0 (mac, ok)
# docker pull k8s.gcr.io/kube-scheduler:v1.23.10 (mac, ok)
docker pull k8s.gcr.io/kube-scheduler:v1.20.15 (mac, ok)
docker pull k8s.gcr.io/pause:3.8 (mac, ok)
docker pull traefik:v2.8.4 (mac, ok)
Save the images to tar archives:
# Save
docker save > k8s-singleuser-sample-2.0.0.tar jupyterhub/k8s-singleuser-sample:2.0.0
# docker save > kube-scheduler-v1.23.10.tar k8s.gcr.io/kube-scheduler:v1.23.10
docker save > kube-scheduler-v1.20.15.tar k8s.gcr.io/kube-scheduler:v1.20.15
docker save > pause-3.8.tar k8s.gcr.io/pause:3.8
docker save > traefik-v2.8.4.tar traefik:v2.8.4
# Bundle them up (assuming the tar files were collected into ./images)
tar -czvf jupyterhub-yilai.tar.gz ./images
# Load them on the offline host
docker load < k8s-singleuser-sample-2.0.0.tar
docker load < kube-scheduler-v1.20.15.tar
docker load < pause-3.8.tar
docker load < traefik-v2.8.4.tar
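If more images need to be moved later, the save/load steps can be scripted; a sketch assuming a hypothetical images.txt that lists one image reference per line:
# Save every image listed in images.txt into ./images/
mkdir -p images
while read -r img; do
  docker save "$img" > "images/$(echo "$img" | tr '/:' '__').tar"
done < images.txt
# On the offline host, load everything back
for f in images/*.tar; do docker load < "$f"; done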
Tag the images and push them to the private registry:
docker tag quay.io/jupyterhub/configurable-http-proxy:4.5.3 magic-harbor.magic.com/jupyterhub/configurable-http-proxy:4.5.3
docker push magic-harbor.magic.com/jupyterhub/configurable-http-proxy:4.5.3
docker tag quay.io/jupyterhub/k8s-hub:2.0.0 magic-harbor.magic.com/jupyterhub/k8s-hub:2.0.0
docker push magic-harbor.magic.com/jupyterhub/k8s-hub:2.0.0
docker tag jupyterhub/k8s-image-awaiter:2.0.0 magic-harbor.magic.com/jupyterhub/k8s-image-awaiter:2.0.0
docker push magic-harbor.magic.com/jupyterhub/k8s-image-awaiter:2.0.0
docker tag jupyterhub/k8s-network-tools:2.0.0 magic-harbor.magic.com/jupyterhub/k8s-network-tools:2.0.0
docker push magic-harbor.magic.com/jupyterhub/k8s-network-tools:2.0.0
docker tag quay.io/jupyterhub/k8s-secret-sync:2.0.0 magic-harbor.magic.com/jupyterhub/k8s-secret-sync:2.0.0
docker push magic-harbor.magic.com/jupyterhub/k8s-secret-sync:2.0.0
docker tag jupyterhub/k8s-singleuser-sample:2.0.0 magic-harbor.magic.com/jupyterhub/k8s-singleuser-sample:2.0.0
docker push magic-harbor.magic.com/jupyterhub/k8s-singleuser-sample:2.0.0
docker tag k8s.gcr.io/kube-scheduler:v1.20.15 magic-harbor.magic.com/jupyterhub/kube-scheduler:v1.20.15
docker push magic-harbor.magic.com/jupyterhub/kube-scheduler:v1.20.15
docker tag k8s.gcr.io/pause:3.8 magic-harbor.magic.com/jupyterhub/pause:3.8
docker push magic-harbor.magic.com/jupyterhub/pause:3.8
docker tag traefik:v2.8.4 magic-harbor.magic.com/jupyterhub/traefik:v2.8.4
docker push magic-harbor.magic.com/jupyterhub/traefik:v2.8.4
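# Also mirror the Dask runtime image that the scheduler/worker pods (and KubeCluster) will use later: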
docker pull ghcr.io/dask/dask:latest
docker tag ghcr.io/dask/dask:latest magic-harbor.magic.com/jupyterhub/dask:latest
docker push magic-harbor.magic.com/jupyterhub/dask:latest
Update the image references in the chart's configuration (jupyterhub/values.yaml) to point at the private registry, then create the namespace and install:
# Create the jhub namespace
kubectl create ns jhub
cd /work
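The image overrides in jupyterhub/values.yaml look roughly like the sketch below; the key paths follow the zero-to-jupyterhub values layout, so confirm each one against the chart's bundled values.yaml before applying:
# jupyterhub/values.yaml (excerpt): point every chart image at the private Harbor
hub:
  image:
    name: magic-harbor.magic.com/jupyterhub/k8s-hub
    tag: "2.0.0"
proxy:
  chp:
    image:
      name: magic-harbor.magic.com/jupyterhub/configurable-http-proxy
      tag: "4.5.3"
  traefik:
    image:
      name: magic-harbor.magic.com/jupyterhub/traefik
      tag: "v2.8.4"
singleuser:
  image:
    name: magic-harbor.magic.com/jupyterhub/k8s-singleuser-sample
    tag: "2.0.0"
  networkTools:
    image:
      name: magic-harbor.magic.com/jupyterhub/k8s-network-tools
      tag: "2.0.0"
prePuller:
  hook:
    image:
      name: magic-harbor.magic.com/jupyterhub/k8s-image-awaiter
      tag: "2.0.0"
  pause:
    image:
      name: magic-harbor.magic.com/jupyterhub/pause
      tag: "3.8"
scheduling:
  userScheduler:
    image:
      name: magic-harbor.magic.com/jupyterhub/kube-scheduler
      tag: "v1.20.15"
  userPlaceholder:
    image:
      name: magic-harbor.magic.com/jupyterhub/pause
      tag: "3.8"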
helm upgrade --cleanup-on-fail \
--install jupyterhub-release ./jupyterhub \
--namespace jhub \
--values config.yaml
The config.yaml file:
hub:
  config:
    Authenticator:
      admin_users:      # JupyterHub admin users
        - admin
      allowed_users:    # if omitted, open self-signup is allowed
        - kaiyi
    DummyAuthenticator:
      password: hub123  # shared password (essentially no security)
    JupyterHub:
      admin_access: true          # whether admin accounts are allowed
      authenticator_class: dummy  # authenticator for all accounts (dummy = accept any)
Deployment output:
[root@192.168.1.101 work]# helm upgrade --cleanup-on-fail \
> --install jupyterhub-release ./jupyterhub \
> --namespace jhub \
> --values config.yaml
Release "jupyterhub-release" has been upgraded. Happy Helming!
NAME: jupyterhub-release
LAST DEPLOYED: Wed Jun 26 02:04:05 2024
NAMESPACE: jhub
STATUS: deployed
REVISION: 2
TEST SUITE: None
NOTES:
. __ __ __ __ __
/ / __ __ ____ __ __ / /_ ___ _____ / / / / __ __ / /_
__ / / / / / / / __ \ / / / / / __/ / _ \ / ___/ / /_/ / / / / / / __ \
/ /_/ / / /_/ / / /_/ / / /_/ / / /_ / __/ / / / __ / / /_/ / / /_/ /
\____/ \__,_/ / .___/ \__, / \__/ \___/ /_/ /_/ /_/ \__,_/ /_.___/
/_/ /____/
You have successfully installed the official JupyterHub Helm chart!
### Installation info
- Kubernetes namespace: jhub
- Helm release name: jupyterhub-release
- Helm chart version: 2.0.0
- JupyterHub version: 3.0.0
[root@192.168.1.101 work]#
Change the JupyterHub proxy-public Service type from LoadBalancer to NodePort (with no cloud load balancer available, its EXTERNAL-IP stays <pending>):
[root@192.168.1.101 images]#kubectl get svc -n jhub
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
hub ClusterIP 10.68.203.41 <none> 8081/TCP 19m
proxy-api ClusterIP 10.68.61.94 <none> 8001/TCP 19m
proxy-public LoadBalancer 10.68.119.149 <pending> 80:32363/TCP 19m
[root@192.168.1.101 images]#kubectl edit svc proxy-public -n jhub
service/proxy-public edited
[root@192.168.1.101 images]#
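Equivalently, the Service type can be patched non-interactively:
kubectl patch svc proxy-public -n jhub -p '{"spec": {"type": "NodePort"}}'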
JupyterHub is now reachable at http://192.168.1.101:32363/hub/login?next=%2Fhub%2F
Log in with admin / hub123.
[root@192.168.1.101 images]#kubectl get pods -n jhub
NAME READY STATUS RESTARTS AGE
continuous-image-puller-9tzj7 1/1 Running 0 23m
continuous-image-puller-g6kf5 1/1 Running 0 23m
continuous-image-puller-jk6fx 1/1 Running 0 23m
continuous-image-puller-n4bn8 1/1 Running 0 23m
continuous-image-puller-qwqvt 1/1 Running 0 23m
continuous-image-puller-zqbm7 1/1 Running 0 23m
hub-6d4978fbfb-hnv57 1/1 Running 0 23m
jupyter-admin 1/1 Running 0 49s
proxy-77f44ff6b7-6p7mv 1/1 Running 0 23m
user-scheduler-65b67b4c9-bmmxc 1/1 Running 0 4m54s
user-scheduler-65b67b4c9-wkp2d 1/1 Running 0 5m28s
[root@192.168.1.101 images]#
Install dependencies
Install the dask-kubernetes package inside JupyterHub (run in a notebook cell):
# pip install dask-kubernetes==2023.1.0
!pip install dask-kubernetes==2023.1.0
Check the installed packages:
$ pip list | grep dask
dask 2024.6.2
dask-kubernetes 2023.1.0
Test (in a fully offline cluster, the image argument should point at the mirrored magic-harbor.magic.com/jupyterhub/dask:latest rather than ghcr.io):
# Check whether a dask_kubernetes cluster can be started
from dask_kubernetes.operator import KubeCluster
cluster = KubeCluster(name="my-dask-cluster", image='ghcr.io/dask/dask:latest')
# cluster.scale(10)
cluster.scale(3)
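To verify the cluster actually accepts work, attach a client to the cluster object and run a trivial computation; a minimal smoke test:
from dask.distributed import Client

# Attach a client to the KubeCluster created above
client = Client(cluster)

# Fan a trivial function out across the workers and gather the results
futures = client.map(lambda x: x + 1, range(10))
print(client.gather(futures))  # [1, 2, ..., 10]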
Troubleshooting
# Show the node's labels
kubectl get node 192.168.1.101 --show-labels
# Show its taints
[root@192.168.1.101 notebook]#kubectl describe nodes 192.168.1.101 | grep Taints
Taints: gpu=gpu:NoSchedule
# Node selection
nodeSelector:  # select nodes by label
  gpucore: gpu
tolerations:   # tolerate the taint; only pods carrying this toleration may land on the node
  - key: gpu
    operator: Equal
    value: gpu
    effect: NoSchedule
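If the target node does not carry the gpucore=gpu label yet, add it with:
kubectl label nodes 192.168.1.101 gpucore=gpu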
Edit jupyterhub/values.yaml so that the jupyter-admin pod is scheduled onto a node that can reach the public network, adding a nodeSelector and a toleration:
# singleuser relates to the configuration of KubeSpawner which runs in the hub
# pod, and its spawning of user pods such as jupyter-myusername.
singleuser:
  podNameTemplate:
  extraTolerations:
    - key: gpu
      operator: Equal
      value: gpu
      effect: NoSchedule
  nodeSelector:
    gpucore: gpu
Remove the taint from the node (the taint key here is gpu):
kubectl taint nodes 192.168.1.101 gpu-
Check whether the pods were scheduled onto node 101:
[root@192.168.1.101 work]#kubectl get pods -n jhub -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
continuous-image-puller-psdd8 1/1 Running 0 15m 172.20.160.144 192.168.1.101 <none> <none>
hub-7fdfb4d57c-sbckk 1/1 Running 0 23m 172.20.193.36 192.168.1.35 <none> <none>
jupyter-admin 1/1 Running 0 15m 172.20.160.145 192.168.1.101 <none> <none>
proxy-77f44ff6b7-6p7mv 1/1 Running 0 9h 172.20.193.38 192.168.1.35 <none> <none>
user-scheduler-65b67b4c9-bmmxc 1/1 Running 0 9h 172.20.193.46 192.168.1.35 <none> <none>
user-scheduler-65b67b4c9-wkp2d 1/1 Running 0 9h 172.20.193.6 192.168.1.35 <none> <none>
[root@192.168.1.101 work]#
Custom resources
cluster.yaml
# cluster.yaml
apiVersion: kubernetes.dask.org/v1
kind: DaskCluster
metadata:
  name: simple
  namespace: jhub
spec:
  worker:
    replicas: 2
    spec:
      containers:
        - name: worker
          image: "magic-harbor.magic.com/jupyterhub/dask:latest"
          imagePullPolicy: "IfNotPresent"
          args:
            - dask-worker
            - --name
            - $(DASK_WORKER_NAME)
            - --dashboard
            - --dashboard-address
            - "8788"
          ports:
            - name: http-dashboard
              containerPort: 8788
              protocol: TCP
  scheduler:
    spec:
      containers:
        - name: scheduler
          image: "magic-harbor.magic.com/jupyterhub/dask:latest"
          imagePullPolicy: "IfNotPresent"
          args:
            - dask-scheduler
          ports:
            - name: tcp-comm
              containerPort: 8786
              protocol: TCP
            - name: http-dashboard
              containerPort: 8787
              protocol: TCP
          readinessProbe:
            httpGet:
              port: http-dashboard
              path: /health
            initialDelaySeconds: 5
            periodSeconds: 10
          livenessProbe:
            httpGet:
              port: http-dashboard
              path: /health
            initialDelaySeconds: 15
            periodSeconds: 20
    service:
      type: NodePort
      selector:
        dask.org/cluster-name: simple
        dask.org/component: scheduler
      ports:
        - name: tcp-comm
          protocol: TCP
          port: 8786
          targetPort: "tcp-comm"
        - name: http-dashboard
          protocol: TCP
          port: 8787
          targetPort: "http-dashboard"
[root@192.168.1.101 work]#kubectl get pods -n jhub
NAME READY STATUS RESTARTS AGE
continuous-image-puller-psdd8 1/1 Running 0 13h
hub-7fdfb4d57c-sbckk 1/1 Running 0 13h
jupyter-admin 1/1 Running 0 62m
proxy-77f44ff6b7-6p7mv 1/1 Running 0 23h
simple-default-worker-593ece0aaf-68c64cb45d-cdgpz 1/1 Running 0 2m49s
simple-default-worker-6967e21d23-987cdf69d-gqbdq 1/1 Running 0 2m49s
simple-scheduler-6d98bb4684-v9rhk 1/1 Running 0 2m49s
user-scheduler-65b67b4c9-bmmxc 1/1 Running 0 23h
user-scheduler-65b67b4c9-wkp2d 1/1 Running 0 23h
[root@192.168.1.101 work]#
[root@192.168.1.101 work]#kubectl get svc -n jhub
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
hub ClusterIP 10.68.203.41 <none> 8081/TCP 23h
proxy-api ClusterIP 10.68.61.94 <none> 8001/TCP 23h
proxy-public LoadBalancer 10.68.119.149 <pending> 80:32363/TCP 23h
simple-scheduler NodePort 10.68.47.69 <none> 8786:31026/TCP,8787:31135/TCP 93s
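With the NodePort Service in place, the scheduler is also reachable from outside the cluster on any node IP: 31026 maps to the scheduler comm port 8786, and 31135 maps to the dashboard on 8787. For example, the dashboard responds at:
curl http://192.168.1.101:31135/status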
Inside the cluster, the scheduler is addressable by its Service DNS name:
simple-scheduler.jhub.svc.cluster.local
or directly by its ClusterIP, 10.68.47.69:8786.
from dask.distributed import Client
client = Client("10.68.47.69:8786")
Adjust the singleuser network policy
JupyterHub enables Kubernetes NetworkPolicy by default, so calling the Dask scheduler service at 10.68.47.69:8786 from a Jupyter notebook fails. Disable the singleuser network policy:
cd /work/jupyterhub
vim values.yaml
# singleuser relates to the configuration of KubeSpawner which runs in the hub
# pod, and its spawning of user pods such as jupyter-myusername.
singleuser:
  podNameTemplate:
  extraTolerations:
    - key: gpu
      operator: Equal
      value: gpu
      effect: NoSchedule
  nodeSelector:
    gpucore: gpu
  extraNodeAffinity:
    required: []
    preferred: []
  ...
  networkPolicy:
    enabled: false  # change this from the default true to false
    ingress: []
    egress: []
    egressAllowRules:
      cloudMetadataServer: false
      dnsPortsPrivateIPs: true
      nonPrivateIPs: true
      privateIPs: false
    interNamespaceAccessLabels: ignore
    allowedIngressPorts: []
Then redeploy JupyterHub:
helm upgrade --cleanup-on-fail \
--install jupyterhub-release ./jupyterhub \
--namespace jhub \
--values config.yaml
Then run distributed Dask from Jupyter:
from dask.distributed import Client
# connect via the Dask scheduler Service DNS name
client = Client("simple-scheduler.jhub.svc.cluster.local:8786")
# client = Client("10.68.47.69:8786")
print(client)
Test:
def square(x):
    return x ** 2

def neg(x):
    return -x

A = client.map(square, range(10))
B = client.map(neg, A)
total = client.submit(sum, B)
total.result()  # -285
Distributed model training
https://examples.dask.org/machine-learning/xgboost.html
Install the dask-ml dependency:
pip install dask_ml
# pip install dask-ml==2023.3.24
# !pip list | grep dask-ml
A machine-learning example:
from dask.distributed import Client
from dask_ml.datasets import make_classification
from dask_ml.model_selection import train_test_split
from dask_ml.linear_model import LogisticRegression
from dask_ml.metrics import accuracy_score

# Connect to the Dask distributed cluster
client = Client("simple-scheduler.jhub.svc.cluster.local:8786")
print(client)

# Generate a sample dataset (synthetic data from dask_ml)
X, y = make_classification(n_samples=100000, n_features=20, random_state=42)

# Split it into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Initialize a logistic regression model
model = LogisticRegression(max_iter=100)

# Train on the cluster; keep the client open until scoring is done
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Compute the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Distributed logistic regression accuracy: {accuracy}")
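To confirm the computation really ran on the cluster, the worker roster known to the scheduler can be inspected; a quick check via the client API:
# Addresses and metadata of the workers the scheduler currently knows about
workers = client.scheduler_info()["workers"]
print(len(workers), "workers:", list(workers))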
Related articles:
Dask 分布式集群部署 (物理机和 k8s) 及实战 (Dask distributed cluster deployment on bare metal and k8s)
Python初学者指南:掌握dask-ml库 (a beginner's guide to the dask-ml library)
污点和容忍度 (Taints and Tolerations)