安装helm3包管理工具
客户端安装
1
2
3
|
wget https://get.helm.sh/helm-v3.2.0-linux-amd64.tar.gz
tar -xf helm-v3.2.0-linux-amd64.tar.gz
cp linux-amd64/helm /usr/local/bin/
|
查看配置信息
1
2
3
4
5
6
7
8
9
10
11
|
[root@kube-mas ~]# helm env
HELM_BIN="helm"
HELM_DEBUG="false"
HELM_KUBEAPISERVER=""
HELM_KUBECONTEXT=""
HELM_KUBETOKEN=""
HELM_NAMESPACE="default"
HELM_PLUGINS="/root/.local/share/helm/plugins"
HELM_REGISTRY_CONFIG="/root/.config/helm/registry.json"
HELM_REPOSITORY_CACHE="/root/.cache/helm/repository"
HELM_REPOSITORY_CONFIG="/root/.config/helm/repositories.yaml"
|
添加公用的仓库
1
2
|
[root@kube-mas ~]# helm repo add aliyuncs https://apphub.aliyuncs.com
[root@kube-mas ~]# helm repo update
|
搜索prometheus-operator
1
2
3
4
|
[root@kube-mas ~]# helm search repo prometheus-operator
NAME CHART VERSION APP VERSION DESCRIPTION
aliyuncs/prometheus-operator 8.7.0 0.35.0 Provides easy monitoring definitions for Kubern...
|
安装prometheus-operator
下载chart包
1
2
3
|
[root@kube-mas ~]# helm pull aliyuncs/prometheus-operator
[root@kube-mas ~]# tar -xf prometheus-operator-8.7.0.tgz -C /opt/
|
k8s创建对应的名称空间
1
|
[root@kube-mas ~]# kubectl create ns mon
|
修改values.yaml文件
创建etcd需要的secret
1
2
3
4
|
[root@kube-mas ~]# cd /etc/kubernetes/pki/etcd/
[root@kube-mas etcd]# kubectl create secret generic etcd-cert --from-file=ca.crt --from-file=ca.key --from-file=server.crt --from-file=server.key -n mon
[root@kube-mas ~]# cd /opt/prometheus-operator/
[root@kube-mas prometheus-operator]# vim values.yaml
|
配置kubeEtcd信息
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
|
743 kubeEtcd:
744 enabled: true
745
746 ## If your etcd is not deployed as a pod, specify IPs it can be found on
747 ##
748 endpoints: []
749 # - 10.141.4.22
750 # - 10.141.4.23
751 # - 10.141.4.24
752
753 ## Etcd service. If using kubeEtcd.endpoints only the port and targetPort are used
754 ##
755 service:
756 port: 2379
757 targetPort: 2379
758 # selector:
759 # component: etcd
760
761 ## Configure secure access to the etcd cluster by loading a secret into prometheus and
762 ## specifying security configuration below. For example, with a secret named etcd-client-cert
763 ##
764 ## serviceMonitor:
765 ## scheme: https
766 ## insecureSkipVerify: false
767 ## serverName: localhost
768 ## caFile: /etc/prometheus/secrets/etcd-client-cert/etcd-ca
769 ## certFile: /etc/prometheus/secrets/etcd-client-cert/etcd-client
770 ## keyFile: /etc/prometheus/secrets/etcd-client-cert/etcd-client-key
771 ##
772 serviceMonitor:
773 ## Scrape interval. If not set, the Prometheus default scrape interval is used.
774 ##
775 interval: ""
776 scheme: https
777 insecureSkipVerify: true
778 serverName: ""
779 caFile: "/etc/prometheus/secrets/ca.crt"
780 certFile: "/etc/prometheus/secrets/server.crt"
781 keyFile: "/etc/prometheus/secrets/server.key"
....
1475 secrets: ["etcd-cert"]
|
k8s集群修改kube-proxy metricsBindAddress的地址
1
2
3
4
|
[root@kube-mas prometheus-operator]# kubectl edit cm kube-proxy -n kube-system
kind: KubeProxyConfiguration
metricsBindAddress: "0.0.0.0:10249"
[root@kube-mas prometheus-operator]# kubectl get po -n kube-system | awk '/kube-proxy/{print "kubectl delete po -n kube-system "$1}' | sh
|
需要注意的是,这里没有使用pv,如果需要持久化存储,需要配置alertmanager以及prometheus storage下面pvc信息
如果alertmanager需要自定制模板的话,需要先创建configmap,然后在alertmanager的configMaps里添加上
例如:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
|
[root@kube-mas prometheus-operator]# cat alertmanager-cm.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: wechat-tmpl
namespace: mon
data:
wechat.tmpl: |
{{ define "wechat.default.message" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{ range .Alerts }}
故障
告警类型: {{ .Labels.alertname }}
告警级别: {{ .Labels.severity }}
=====================
===告警详情===
告警主题: {{ .Annotations.summary }}
告警详情: {{ .Annotations.description }}
故障时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
===参考信息===<br>
{{- if gt (len .Labels.instance) 0 -}}
故障实例ip: {{ .Labels.instance }}
{{ end -}}
{{- if gt (len .Labels.namespace) 0 -}}
故障实例所在namespace: {{ .Labels.namespace }}
{{ end -}}
{{- if gt (len .Labels.node) 0 -}}
故障物理机ip: {{ .Labels.node }}
{{ end -}}
{{- if gt (len .Labels.pod) 0 -}}
故障pod名称: {{ .Labels.pod }}
{{ end -}}
=====================
{{ end }}
{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{ range .Alerts }}
故障恢复
告警类型: {{ .Labels.alertname }}
告警级别: {{ .Labels.severity }}
=====================
===告警详情===
告警详情: {{ .Annotations.message }}
故障时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复时间: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
===参考信息===<br>
{{- if gt (len .Labels.instance) 0 -}}
故障实例ip: {{ .Labels.instance }}
{{ end -}}
{{- if gt (len .Labels.namespace) 0 -}}
故障实例所在namespace: {{ .Labels.namespace }}
{{ end -}}
{{- if gt (len .Labels.node) 0 -}}
故障物理机ip: {{ .Labels.node }}
{{ end -}}
{{- if gt (len .Labels.pod) 0 -}}
故障pod名称: {{ .Labels.pod }}
{{ end -}}
=====================
{{ end }}
{{ end -}}
{{- end }}
[root@kube-mas prometheus-operator]# kubectl apply -f alertmanager-cm.yaml
|
配置alertmanager信息
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
109 config:
110 global:
111 resolve_timeout: 5m
112 wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'
113 wechat_api_secret: 'xxxxxxxxxxxxxxxxxxxxxxxxx'
114 wechat_api_corp_id: 'xxxxxxxxxxxxxxxxxxxxxxxx'
115 templates:
116 - '/etc/alertmanager/configmaps/wechat-tmpl/*.tmpl'
117 route:
118 group_by: ['job']
119 group_wait: 30s
120 group_interval: 5m
121 repeat_interval: 12h
122 receiver: 'wechat'
123 receivers:
124 - name: 'wechat'
125 wechat_configs:
126 - send_resolved: true
127 corp_id: 'xxxxxxxxxxxxxxxxxx'
128 to_user: '@all'
129 message: '{{ template "wechat.default.message" . }}'
130 agent_id: '1000010'
131 api_secret: 'xxxxxxxxxxxxxxxxxxxxxx'
132 inhibit_rules:
133 - source_match:
134 severity: 'critical'
135 target_match:
136 severity: 'warning'
137 equal: ['alertname', 'dev', 'instance']
....
278 configMaps: ["wechat-tmpl"]
|
安装
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
[root@kube-mas prometheus-operator]# helm install promethus-operator --namespace=mon .
如果出现manifest_sorter.go:192: info: skipping unknown hook: "crd-install"
可以忽略
列出所安装的release
列出所有名称空间
[root@kube-mas prometheus-operator]# helm list -A
NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION
promethus-operator mon 1 2021-08-06 12:13:03.616675919 +0800 CST deployed prometheus-operator-8.7.0 0.35.0
列出某个名称空间
[root@kube-mas prometheus-operator]# helm list -n mon
NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION
promethus-operator mon 1 2021-08-06 12:13:03.616675919 +0800 CST deployed prometheus-operator-8.7.0 0.35.0
|
将prometheus 9090 集群端口类型改为nodeport端口30090
1
2
|
[root@master1 ~]# kubectl patch svc -n mon promethus-operator-prometh-prometheus -p '{"spec":{"type":"NodePort","ports":[{"name":"web","port":9090,"nodePort":30090}]}}'
service/promethus-operator-prometh-prometheus patched
|
将grafana 的80 端口映射到nodeport的30080
1
|
kubectl patch svc -n mon promethus-operator-grafana -p '{"spec":{"type":"NodePort","ports":[{"name":"service","port":80,"nodePort":30080}]}}'
|
将alertmanager 9093端口映射到nodeport的30093
1
|
kubectl patch svc -n mon promethus-operator-prometh-alertmanager -p '{"spec":{"type":"NodePort","ports":[{"name":"service","port":9093,"nodePort":30093}]}}'
|
1
2
3
4
5
6
7
|
查看grafana的账号密码
kubectl get secret promethus-operator-grafana -n mon -o=jsonpath={.data.admin-user} | base64 -d
admin
kubectl get secret promethus-operator-grafana -n mon -o=jsonpath='{.data.admin-password}' | base64 -d
prom-operator
|
卸载
1
|
[root@kube-mas prometheus-operator]# helm uninstall promethus-operator -n mon
|