Offline Deployment of a Dask Distributed Cluster and JupyterHub on K8s

For deploying Dask and JupyterHub on k8s with public internet access, see the previous article: Dask 分布式集群部署 (物理机和 k8s) 及实战.

Environment

  • k8s version == v1.20.5
  • dask-kubernetes-operator == 2024.5.0
  • jupyterhub Helm chart == 2.0.0 (JupyterHub app 3.0.0; Python version: ?)

Section 1: Deploy dask-operator

Offline installation

# Install dask-operator offline (run the pull steps on an internet-connected machine)

helm repo add dask https://helm.dask.org
helm repo update

mkdir /work/helm-charts
cd /work/helm-charts

# Pull the dask-kubernetes-operator chart for offline use
helm pull dask/dask-kubernetes-operator
ls -l
dask-kubernetes-operator-2024.5.0.tgz

# Extract
tar -zxvf dask-kubernetes-operator-2024.5.0.tgz

# Pull the operator image
docker pull ghcr.io/dask/dask-kubernetes-operator:2024.5.0

# Re-tag for the private registry
docker tag  ghcr.io/dask/dask-kubernetes-operator:2024.5.0    baidu-harbor.baidu.com/deeplearning/dask-kubernetes-operator:2024.5.0

# Push to the private registry
docker push baidu-harbor.baidu.com/deeplearning/dask-kubernetes-operator:2024.5.0

# Point the chart at the private image before installing (assumption: the operator
# chart exposes image.name / image.tag in ./dask-kubernetes-operator/values.yaml),
# then install from the local chart directory
helm install dask-kubernetes-operator  ./dask-kubernetes-operator --create-namespace -n dask-operator

# Check whether the operator installed successfully
kubectl get pods -n dask-operator

[root@192.168.1.101 work]#kubectl get pods -n dask-operator
NAME                                        READY   STATUS    RESTARTS   AGE
dask-kubernetes-operator-686bfb958f-kscjf   1/1     Running   0          2m18s
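
With the operator pod running, a quick sanity check is that its CustomResourceDefinitions were registered (the CRD names below are the ones dask-kubernetes-operator ships; exact names may vary by chart version):

kubectl get crd | grep dask.org
# expected, roughly:
# daskautoscalers.kubernetes.dask.org
# daskclusters.kubernetes.dask.org
# daskjobs.kubernetes.dask.org
# daskworkergroups.kubernetes.dask.org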

Section 2: Deploy JupyterHub

1. Add the JupyterHub Helm repo

helm repo add jupyterhub https://hub.jupyter.org/helm-chart/
helm repo update

2. Pull the Helm chart locally

# Browse available versions at https://hub.jupyter.org/helm-chart/

# Pin a version; 3.3.7 requires k8s version >= 1.23.0-0
# helm pull jupyterhub/jupyterhub --version 3.3.7
# As of 2024-06-23, 3.3.7 is also what a default pull fetches

# jupyterhub chart 2.0.0 requires k8s version >= 1.20.0
# After pulling the chart, `cat Chart.yaml` shows its dependency versions
helm pull jupyterhub/jupyterhub --version 2.0.0

[root@192.168.1.101 work]#ls -l
total 88
drwxr-xr-x 4 root root   108 Jun 22 18:11 dask-kubernetes-operator
-rw-r--r-- 1 root root 40432 Jun 22 17:58 dask-kubernetes-operator-2024.5.0.tgz
-rw-r--r-- 1 root root 45296 Jun 23 12:42 jupyterhub-2.0.0.tgz

3. Extract


[root@192.168.1.101 work]# tar -zxvf jupyterhub-2.0.0.tgz

4. Inspect the images the chart depends on:

cd jupyterhub/

[root@192.168.1.101 jupyterhub]#cat Chart.yaml 
annotations:
  artifacthub.io/images: |
    - image: jupyterhub/configurable-http-proxy:4.5.3
      name: configurable-http-proxy
    - image: jupyterhub/k8s-hub:2.0.0
      name: k8s-hub
    - image: jupyterhub/k8s-image-awaiter:2.0.0
      name: k8s-image-awaiter
    - image: jupyterhub/k8s-network-tools:2.0.0
      name: k8s-network-tools
    - image: jupyterhub/k8s-secret-sync:2.0.0
      name: k8s-secret-sync
    - image: jupyterhub/k8s-singleuser-sample:2.0.0
      name: k8s-singleuser-sample
    - image: k8s.gcr.io/kube-scheduler:v1.23.10
      name: kube-scheduler
    - image: k8s.gcr.io/pause:3.8
      name: pause
    - image: k8s.gcr.io/pause:3.8
      name: pause
    - image: traefik:v2.8.4
      name: traefik
apiVersion: v2
appVersion: 3.0.0
description: Multi-user Jupyter installation
home: https://z2jh.jupyter.org
icon: https://jupyterhub.github.io/helm-chart/images/hublogo.svg
keywords:
- jupyter
- jupyterhub
- z2jh
kubeVersion: '>=1.20.0-0'
maintainers:
- email: erik@sundellopensource.se
  name: Erik Sundell
- name: Simon Li
  url: https://github.com/manics/
name: jupyterhub
sources:
- https://github.com/jupyterhub/zero-to-jupyterhub-k8s
version: 2.0.0
[root@192.168.1.101 jupyterhub]#

Manually pull the images locally. First add China-mainland registry mirrors:

vim  /etc/docker/daemon.json

{
    "registry-mirrors" : [
    "https://registry.docker-cn.com",
    "http://hub-mirror.c.163.com",
    "https://docker.mirrors.ustc.edu.cn",
    "https://cr.console.aliyun.com",
    "https://mirror.ccs.tencentyun.com"
  ]
}

Restart Docker:

systemctl daemon-reload
systemctl restart docker.service
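
Verify the mirror configuration took effect:

docker info | grep -A 5 'Registry Mirrors'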

Pull the images:

# docker pull jupyterhub/configurable-http-proxy:4.5.3
docker pull quay.io/jupyterhub/configurable-http-proxy:4.5.3   # ok

# docker pull jupyterhub/k8s-hub:2.0.0
docker pull quay.io/jupyterhub/k8s-hub:2.0.0                   # ok

docker pull jupyterhub/k8s-image-awaiter:2.0.0                 # ok
docker pull jupyterhub/k8s-network-tools:2.0.0                 # ok

# docker pull jupyterhub/k8s-secret-sync:2.0.0
docker pull quay.io/jupyterhub/k8s-secret-sync:2.0.0           # ok

# docker pull quay.io/jupyterhub/k8s-singleuser-sample:2.0.0   # failed
docker pull jupyterhub/k8s-singleuser-sample:2.0.0             # ok, pulled on a Mac with internet access

# docker pull k8s.gcr.io/kube-scheduler:v1.23.10               # ok on a Mac; v1.20.15 below matches this cluster's k8s version
docker pull k8s.gcr.io/kube-scheduler:v1.20.15                 # ok, pulled on a Mac

docker pull k8s.gcr.io/pause:3.8                               # ok, pulled on a Mac
docker pull traefik:v2.8.4                                     # ok, pulled on a Mac

Save the images to tar files:

# Save
docker save > k8s-singleuser-sample-2.0.0.tar  jupyterhub/k8s-singleuser-sample:2.0.0
# docker save > kube-scheduler-v1.23.10.tar k8s.gcr.io/kube-scheduler:v1.23.10
docker save > kube-scheduler-v1.20.15.tar k8s.gcr.io/kube-scheduler:v1.20.15
docker save > pause-3.8.tar k8s.gcr.io/pause:3.8
docker save > traefik-v2.8.4.tar  traefik:v2.8.4

# Bundle for transfer (assumes the saved .tar files were placed under ./images)
tar -czvf  jupyterhub-yilai.tar.gz ./images

# Load on the offline host
docker load  < k8s-singleuser-sample-2.0.0.tar
docker load  < kube-scheduler-v1.20.15.tar
docker load  < pause-3.8.tar
docker load  < traefik-v2.8.4.tar
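
Handling each tar by hand gets tedious; a small helper loop can save and reload the whole set. A sketch, assuming the image list from Chart.yaml above and an ./images staging directory:

# On the machine with internet access: save every image into ./images
mkdir -p images
for img in \
    quay.io/jupyterhub/configurable-http-proxy:4.5.3 \
    quay.io/jupyterhub/k8s-hub:2.0.0 \
    jupyterhub/k8s-image-awaiter:2.0.0 \
    jupyterhub/k8s-network-tools:2.0.0 \
    quay.io/jupyterhub/k8s-secret-sync:2.0.0 \
    jupyterhub/k8s-singleuser-sample:2.0.0 \
    k8s.gcr.io/kube-scheduler:v1.20.15 \
    k8s.gcr.io/pause:3.8 \
    traefik:v2.8.4; do
  docker save "$img" > "images/$(echo "$img" | tr '/:' '__').tar"
done

# On the offline host: load everything back
for f in images/*.tar; do docker load < "$f"; done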

Re-tag the images and push them to the local registry:

docker tag  quay.io/jupyterhub/configurable-http-proxy:4.5.3    magic-harbor.magic.com/jupyterhub/configurable-http-proxy:4.5.3
docker push magic-harbor.magic.com/jupyterhub/configurable-http-proxy:4.5.3

docker tag  quay.io/jupyterhub/k8s-hub:2.0.0    magic-harbor.magic.com/jupyterhub/k8s-hub:2.0.0
docker push magic-harbor.magic.com/jupyterhub/k8s-hub:2.0.0

docker tag  jupyterhub/k8s-image-awaiter:2.0.0    magic-harbor.magic.com/jupyterhub/k8s-image-awaiter:2.0.0
docker push magic-harbor.magic.com/jupyterhub/k8s-image-awaiter:2.0.0

docker tag jupyterhub/k8s-network-tools:2.0.0    magic-harbor.magic.com/jupyterhub/k8s-network-tools:2.0.0
docker push magic-harbor.magic.com/jupyterhub/k8s-network-tools:2.0.0

docker tag quay.io/jupyterhub/k8s-secret-sync:2.0.0    magic-harbor.magic.com/jupyterhub/k8s-secret-sync:2.0.0
docker push magic-harbor.magic.com/jupyterhub/k8s-secret-sync:2.0.0

docker tag jupyterhub/k8s-singleuser-sample:2.0.0    magic-harbor.magic.com/jupyterhub/k8s-singleuser-sample:2.0.0
docker push magic-harbor.magic.com/jupyterhub/k8s-singleuser-sample:2.0.0

docker tag k8s.gcr.io/kube-scheduler:v1.20.15    magic-harbor.magic.com/jupyterhub/kube-scheduler:v1.20.15
docker push magic-harbor.magic.com/jupyterhub/kube-scheduler:v1.20.15

docker tag k8s.gcr.io/pause:3.8    magic-harbor.magic.com/jupyterhub/pause:3.8
docker push magic-harbor.magic.com/jupyterhub/pause:3.8

docker tag traefik:v2.8.4    magic-harbor.magic.com/jupyterhub/traefik:v2.8.4
docker push magic-harbor.magic.com/jupyterhub/traefik:v2.8.4

docker pull ghcr.io/dask/dask:latest
docker tag ghcr.io/dask/dask:latest    magic-harbor.magic.com/jupyterhub/dask:latest
docker push magic-harbor.magic.com/jupyterhub/dask:latest
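
The per-image tag/push pairs above can also be collapsed into a loop; a sketch that keeps each image:tag and swaps in the Harbor project (extend the list to the full image set):

HARBOR=magic-harbor.magic.com/jupyterhub
for src in quay.io/jupyterhub/configurable-http-proxy:4.5.3 \
           quay.io/jupyterhub/k8s-hub:2.0.0 \
           k8s.gcr.io/kube-scheduler:v1.20.15 \
           k8s.gcr.io/pause:3.8 \
           traefik:v2.8.4; do
  dst="$HARBOR/$(basename "$src")"   # e.g. .../jupyterhub/k8s-hub:2.0.0
  docker tag "$src" "$dst"
  docker push "$dst"
done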

Update the configuration so the image references point at the private registry (an override sketch follows the config.yaml below), then install:

# (the dask-operator was already installed in Section 1)
# helm install dask-kubernetes-operator  ./dask-kubernetes-operator --create-namespace -n dask-operator

# Create the jhub namespace
kubectl create ns jhub
cd /work

 helm upgrade --cleanup-on-fail \
 --install jupyterhub-release ./jupyterhub \
 --namespace jhub \
 --values config.yaml

The config.yaml file:

hub:
  config:
    Authenticator:
      admin_users:                          # JupyterHub admin users
      - admin
      allowed_users:                        # if omitted, open sign-up is allowed
      - kaiyi
    DummyAuthenticator:
      password: hub123                      # shared password for everyone (essentially no security)
    JupyterHub:
      admin_access: true                    # allow the admin account access
      authenticator_class: dummy            # authenticator class for all accounts (the dummy one)
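
The config.yaml above only configures authentication; the chart itself still points at upstream image registries. To actually use the Harbor copies pushed earlier, add image overrides as well. A hedged sketch (the key paths follow the z2jh 2.0.0 values schema; verify them against jupyterhub/values.yaml before applying):

hub:
  image:
    name: magic-harbor.magic.com/jupyterhub/k8s-hub
    tag: "2.0.0"
proxy:
  chp:
    image:
      name: magic-harbor.magic.com/jupyterhub/configurable-http-proxy
      tag: "4.5.3"
singleuser:
  image:
    name: magic-harbor.magic.com/jupyterhub/k8s-singleuser-sample
    tag: "2.0.0"
scheduling:
  userScheduler:
    image:
      name: magic-harbor.magic.com/jupyterhub/kube-scheduler
      tag: "v1.20.15"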

Deployment output:

[root@192.168.1.101 work]# helm upgrade --cleanup-on-fail \
>  --install jupyterhub-release ./jupyterhub \
>  --namespace jhub \
>  --values config.yaml
Release "jupyterhub-release" has been upgraded. Happy Helming!
NAME: jupyterhub-release
LAST DEPLOYED: Wed Jun 26 02:04:05 2024
NAMESPACE: jhub
STATUS: deployed
REVISION: 2
TEST SUITE: None
NOTES:
.      __                          __                  __  __          __
      / / __  __  ____    __  __  / /_  ___    _____  / / / / __  __  / /_
 __  / / / / / / / __ \  / / / / / __/ / _ \  / ___/ / /_/ / / / / / / __ \
/ /_/ / / /_/ / / /_/ / / /_/ / / /_  /  __/ / /    / __  / / /_/ / / /_/ /
\____/  \__,_/ / .___/  \__, /  \__/  \___/ /_/    /_/ /_/  \__,_/ /_.___/
              /_/      /____/

       You have successfully installed the official JupyterHub Helm chart!

### Installation info

  - Kubernetes namespace: jhub
  - Helm release name:    jupyterhub-release
  - Helm chart version:   2.0.0
  - JupyterHub version:   3.0.0
[root@192.168.1.101 work]#

Change the JupyterHub proxy-public Service from LoadBalancer to NodePort:

[root@192.168.1.101 images]#kubectl get svc -n jhub
NAME           TYPE           CLUSTER-IP      EXTERNAL-IP   PORT(S)        AGE
hub            ClusterIP      10.68.203.41    <none>        8081/TCP       19m
proxy-api      ClusterIP      10.68.61.94     <none>        8001/TCP       19m
proxy-public   LoadBalancer   10.68.119.149   <pending>     80:32363/TCP   19m
[root@192.168.1.101 images]#kubectl edit svc proxy-public -n jhub
service/proxy-public edited
[root@192.168.1.101 images]#
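
kubectl edit opens an interactive editor; the same change can be made non-interactively:

kubectl patch svc proxy-public -n jhub -p '{"spec":{"type":"NodePort"}}'
kubectl get svc proxy-public -n jhub   # TYPE should now show NodePort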

Open http://192.168.1.101:32363/hub/login?next=%2Fhub%2F and log in to JupyterHub with admin / hub123.

[root@192.168.1.101 images]#kubectl get pods -n jhub
NAME                             READY   STATUS    RESTARTS   AGE
continuous-image-puller-9tzj7    1/1     Running   0          23m
continuous-image-puller-g6kf5    1/1     Running   0          23m
continuous-image-puller-jk6fx    1/1     Running   0          23m
continuous-image-puller-n4bn8    1/1     Running   0          23m
continuous-image-puller-qwqvt    1/1     Running   0          23m
continuous-image-puller-zqbm7    1/1     Running   0          23m
hub-6d4978fbfb-hnv57             1/1     Running   0          23m
jupyter-admin                    1/1     Running   0          49s
proxy-77f44ff6b7-6p7mv           1/1     Running   0          23m
user-scheduler-65b67b4c9-bmmxc   1/1     Running   0          4m54s
user-scheduler-65b67b4c9-wkp2d   1/1     Running   0          5m28s
[root@192.168.1.101 images]#

Install dependencies

Install the dask-kubernetes dependency inside a JupyterHub notebook:

# pip install dask-kubernetes==2023.1.0
!pip install dask-kubernetes==2023.1.0


Check the installed packages:

$ pip list | grep dask
dask                 2024.6.2
dask-kubernetes      2023.1.0

Test:

# Verify that a Dask cluster can be started via dask_kubernetes
from dask_kubernetes.operator import KubeCluster
cluster = KubeCluster(name="my-dask-cluster", image='ghcr.io/dask/dask:latest')
# cluster.scale(10)
cluster.scale(3)
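
Once the cluster object exists, attach a distributed client to it and run a trivial task to confirm the workers respond (a minimal sketch using the standard dask.distributed API):

from dask.distributed import Client

client = Client(cluster)                            # connect to the KubeCluster's scheduler
print(client.submit(lambda x: x + 1, 10).result())  # expect 11

client.close()
cluster.close()                                     # tear the cluster down when finished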


Troubleshooting

# Show node labels
kubectl get node 192.168.1.101 --show-labels

# Show node taints
[root@192.168.1.101 notebook]#kubectl  describe nodes 192.168.1.101  | grep Taints
Taints:             gpu=gpu:NoSchedule

# Node selection
nodeSelector:            # match on node labels
  gpucore: gpu
tolerations:             # tolerate the taint; only pods carrying this toleration can land on the node
- key: gpu
  operator: Equal
  value: gpu
  effect: NoSchedule

Edit jupyterhub/values.yaml so the jupyter-admin pod is scheduled onto a node with public network access, by adding a nodeSelector and tolerations:

# singleuser relates to the configuration of KubeSpawner which runs in the hub
# pod, and its spawning of user pods such as jupyter-myusername.
singleuser:
  podNameTemplate:
  extraTolerations:
    - key: gpu
      operator: Equal
      value: gpu
      effect: NoSchedule
  nodeSelector:
    gpucore: gpu


To remove the node taint instead (the taint key here is gpu, per the describe output above):

kubectl taint nodes 192.168.1.101 gpu-

Check whether the pod was scheduled onto node 101:

[root@192.168.1.101 work]#kubectl get pods -n jhub -o wide
NAME                             READY   STATUS    RESTARTS   AGE   IP               NODE            NOMINATED NODE   READINESS GATES
continuous-image-puller-psdd8    1/1     Running   0          15m   172.20.160.144   192.168.1.101   <none>           <none>
hub-7fdfb4d57c-sbckk             1/1     Running   0          23m   172.20.193.36    192.168.1.35    <none>           <none>
jupyter-admin                    1/1     Running   0          15m   172.20.160.145   192.168.1.101   <none>           <none>
proxy-77f44ff6b7-6p7mv           1/1     Running   0          9h    172.20.193.38    192.168.1.35    <none>           <none>
user-scheduler-65b67b4c9-bmmxc   1/1     Running   0          9h    172.20.193.46    192.168.1.35    <none>           <none>
user-scheduler-65b67b4c9-wkp2d   1/1     Running   0          9h    172.20.193.6     192.168.1.35    <none>           <none>
[root@192.168.1.101 work]#

Custom resources

cluster.yaml

# cluster.yaml
apiVersion: kubernetes.dask.org/v1
kind: DaskCluster
metadata:
  name: simple
  namespace: jhub
spec:
  worker:
    replicas: 2
    spec:
      containers:
      - name: worker
        image: "magic-harbor.magic.com/jupyterhub/dask:latest"
        imagePullPolicy: "IfNotPresent"
        args:
          - dask-worker
          - --name
          - $(DASK_WORKER_NAME)
          - --dashboard
          - --dashboard-address
          - "8788"
        ports:
          - name: http-dashboard
            containerPort: 8788
            protocol: TCP
  scheduler:
    spec:
      containers:
      - name: scheduler
        image: "magic-harbor.magic.com/jupyterhub/dask:latest"
        imagePullPolicy: "IfNotPresent"
        args:
          - dask-scheduler
        ports:
          - name: tcp-comm
            containerPort: 8786
            protocol: TCP
          - name: http-dashboard
            containerPort: 8787
            protocol: TCP
        readinessProbe:
          httpGet:
            port: http-dashboard
            path: /health
          initialDelaySeconds: 5
          periodSeconds: 10
        livenessProbe:
          httpGet:
            port: http-dashboard
            path: /health
          initialDelaySeconds: 15
          periodSeconds: 20
    service:
      type: NodePort
      selector:
        dask.org/cluster-name: simple
        dask.org/component: scheduler
      ports:
      - name: tcp-comm
        protocol: TCP
        port: 8786
        targetPort: "tcp-comm"
      - name: http-dashboard
        protocol: TCP
        port: 8787
        targetPort: "http-dashboard"
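
Apply the manifest (this step is implied between the manifest and the pod listing below; assuming it was saved as cluster.yaml):

kubectl apply -f cluster.yaml
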
[root@192.168.1.101 work]#kubectl get pods -n jhub
NAME                                                READY   STATUS    RESTARTS   AGE
continuous-image-puller-psdd8                       1/1     Running   0          13h
hub-7fdfb4d57c-sbckk                                1/1     Running   0          13h
jupyter-admin                                       1/1     Running   0          62m
proxy-77f44ff6b7-6p7mv                              1/1     Running   0          23h
simple-default-worker-593ece0aaf-68c64cb45d-cdgpz   1/1     Running   0          2m49s
simple-default-worker-6967e21d23-987cdf69d-gqbdq    1/1     Running   0          2m49s
simple-scheduler-6d98bb4684-v9rhk                   1/1     Running   0          2m49s
user-scheduler-65b67b4c9-bmmxc                      1/1     Running   0          23h
user-scheduler-65b67b4c9-wkp2d                      1/1     Running   0          23h
[root@192.168.1.101 work]#

[root@192.168.1.101 work]#kubectl get svc -n jhub
NAME               TYPE           CLUSTER-IP      EXTERNAL-IP   PORT(S)                         AGE
hub                ClusterIP      10.68.203.41    <none>        8081/TCP                        23h
proxy-api          ClusterIP      10.68.61.94     <none>        8001/TCP                        23h
proxy-public       LoadBalancer   10.68.119.149   <pending>     80:32363/TCP                    23h
simple-scheduler   NodePort       10.68.47.69     <none>        8786:31026/TCP,8787:31135/TCP   93s

In-cluster address: the scheduler Service resolves at

simple-scheduler.jhub.svc.cluster.local:8786

or directly via its ClusterIP (10.68.47.69:8786 in the listing above):

from dask.distributed import Client
client = Client("10.68.47.69:8786")
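
Because the scheduler Service is NodePort, a client outside the cluster can also reach it through any node IP and the mapped port (31026 in the `kubectl get svc` output above); note that a dask version mismatch between client and scheduler may produce warnings:

from dask.distributed import Client
client = Client("192.168.1.101:31026")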

Modify the singleuser network policy

JupyterHub enables a NetworkPolicy for singleuser pods by default, so calling the Dask scheduler Service (10.68.47.69:8786) from a Jupyter notebook fails. Disable the network policy:

cd /work/jupyterhub
vim values.yaml


# singleuser relates to the configuration of KubeSpawner which runs in the hub
# pod, and its spawning of user pods such as jupyter-myusername.
singleuser:
  podNameTemplate:
  extraTolerations:
    - key: gpu
      operator: Equal
      value: gpu
      effect: NoSchedule
  nodeSelector:
    gpucore: gpu
  extraNodeAffinity:
    required: []
    preferred: []
...
  networkPolicy:
    enabled: false   # change this from the default true to false
    ingress: []
    egress: []
    egressAllowRules:
      cloudMetadataServer: false
      dnsPortsPrivateIPs: true
      nonPrivateIPs: true
      privateIPs: false
    interNamespaceAccessLabels: ignore
    allowedIngressPorts: []

After the change, redeploy JupyterHub:

 helm upgrade --cleanup-on-fail \
 --install jupyterhub-release ./jupyterhub \
 --namespace jhub \
 --values config.yaml

Then run distributed Dask from Jupyter:

from dask.distributed import Client
# connect via the Dask scheduler's Service DNS name
client = Client("simple-scheduler.jhub.svc.cluster.local:8786")
# client = Client("10.68.47.69:8786")
print(client)
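
To confirm both workers registered with the scheduler:

print(len(client.scheduler_info()["workers"]))   # expect 2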

Test:

def square(x):
    return x ** 2

def neg(x):
    return -x

A = client.map(square, range(10))
B = client.map(neg, A)
total = client.submit(sum, B)

total.result()
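
For this pipeline, total.result() evaluates to -285: the squares of 0 through 9 sum to 285, and each is negated before the final sum.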


Distributed modeling

https://examples.dask.org/machine-learning/xgboost.html

Install the dask-ml dependency:

pip install dask_ml
# pip install dask-ml==2023.3.24
# !pip list | grep dask-ml 

Machine learning model:

from dask.distributed import Client
from dask_ml.datasets import make_classification
from dask_ml.model_selection import train_test_split
from dask_ml.linear_model import LogisticRegression
from dask_ml.metrics import accuracy_score

# Connect to the Dask distributed cluster
client = Client("simple-scheduler.jhub.svc.cluster.local:8786")
print(client)

# Generate a sample dataset (simulated data from dask_ml)
X, y = make_classification(n_samples=100000, n_features=20, random_state=42)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Initialize the logistic regression model
model = LogisticRegression(max_iter=100)

# Train the model on the distributed cluster
# (note: don't wrap this in `with client:` -- that would close the client
# before the predict and scoring steps below run)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Compute the prediction accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"Distributed logistic regression accuracy: {accuracy}")

Related articles:
Dask 分布式集群部署 (物理机和 k8s) 及实战
Python初学者指南:掌握dask-ml库
污点和容忍度

为者常成,行者常至 (Those who act often succeed; those who keep going often arrive.)