-
Notifications
You must be signed in to change notification settings - Fork 385
Enrich failure handling #1065
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Enrich failure handling #1065
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -17,8 +17,10 @@ limitations under the License. | |||||
package utils | ||||||
|
||||||
import ( | ||||||
"bytes" | ||||||
"context" | ||||||
"fmt" | ||||||
"io" | ||||||
"time" | ||||||
|
||||||
corev1 "k8s.io/api/core/v1" | ||||||
|
@@ -85,11 +87,11 @@ func DeletePod(clientSet *kubernetes.Clientset, namespace, podName string) error | |||||
|
||||||
func WaitForPodRunning(clientSet kubernetes.Interface, namespace, podName string) error { | ||||||
const ( | ||||||
checkInterval = 5 * time.Second // Interval for checking Pod status | ||||||
timeout = 5 * time.Minute // Increased timeout for GPU Pods | ||||||
checkInterval = 30 * time.Second // Interval for checking Pod status | ||||||
timeout = 5 * time.Minute // Increased timeout for GPU Pods | ||||||
) | ||||||
|
||||||
return wait.PollUntilContextTimeout(context.TODO(), checkInterval, timeout, true, func(context.Context) (bool, error) { | ||||||
return wait.PollUntilContextTimeout(context.TODO(), checkInterval, timeout, true, func(ctx context.Context) (bool, error) { | ||||||
// Fetch the Pod object from the Kubernetes API | ||||||
pod, err := clientSet.CoreV1().Pods(namespace).Get(context.TODO(), podName, metav1.GetOptions{}) | ||||||
if err != nil { | ||||||
|
@@ -123,3 +125,80 @@ func WaitForPodRunning(clientSet kubernetes.Interface, namespace, podName string | |||||
return false, nil | ||||||
}) | ||||||
} | ||||||
|
||||||
func GetNamespaceList(clientSet *kubernetes.Clientset) ([]string, error) { | ||||||
namespaces, err := clientSet.CoreV1().Namespaces().List(context.TODO(), metav1.ListOptions{}) | ||||||
if err != nil { | ||||||
klog.Errorf("Failed to list namespaces: %v", err) | ||||||
return nil, err | ||||||
} | ||||||
|
||||||
var nsList []string | ||||||
for _, ns := range namespaces.Items { | ||||||
nsList = append(nsList, ns.Name) | ||||||
} | ||||||
|
||||||
return nsList, err | ||||||
} | ||||||
|
||||||
func GetPodLogs(clientSet *kubernetes.Clientset, namespace, podName string) (string, error) { | ||||||
req := clientSet.CoreV1().Pods(namespace).GetLogs(podName, &corev1.PodLogOptions{}) | ||||||
podLogs, err := req.Stream(context.TODO()) | ||||||
if err != nil { | ||||||
return "", err | ||||||
} | ||||||
defer podLogs.Close() | ||||||
buf := new(bytes.Buffer) | ||||||
if _, err = io.Copy(buf, podLogs); err != nil { | ||||||
return "", err | ||||||
} | ||||||
return buf.String(), nil | ||||||
} | ||||||
|
||||||
func CheckPodDetails(clientSet *kubernetes.Clientset) { | ||||||
namespaces, err := GetNamespaceList(clientSet) | ||||||
if err != nil { | ||||||
klog.Errorf("Failed to get namespaces: %v", err) | ||||||
return | ||||||
} | ||||||
|
||||||
for _, ns := range namespaces { | ||||||
pods, err := GetPods(clientSet, ns) | ||||||
if err != nil { | ||||||
klog.Errorf("Failed to get pods in namespace %s: %v", ns, err) | ||||||
continue | ||||||
} | ||||||
|
||||||
for _, pod := range pods.Items { | ||||||
status := pod.Status.Phase | ||||||
|
||||||
if status == corev1.PodRunning || status == corev1.PodSucceeded { | ||||||
continue | ||||||
} | ||||||
|
||||||
klog.Infof("Pod %s/%s is in %s status", ns, pod.Name, status) | ||||||
|
||||||
klog.Infof("Show events for %s/%s:", ns, pod.Name) | ||||||
events, err := GetPodEvents(clientSet, ns, pod.Name) | ||||||
if err != nil { | ||||||
klog.Errorf("Failed to get events for %s/%s: %v", ns, pod.Name, err) | ||||||
return | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Returning here stops logging details for other pods. Consider using
Suggested change
Copilot uses AI. Check for mistakes. Positive FeedbackNegative Feedback |
||||||
} | ||||||
|
||||||
if len(events) > 0 { | ||||||
for _, event := range events { | ||||||
klog.Infof("Reason: %s, Message: %s \n", event.Reason, event.Message) | ||||||
} | ||||||
} | ||||||
|
||||||
logs, err := GetPodLogs(clientSet, ns, pod.Name) | ||||||
if err != nil { | ||||||
klog.Errorf("Failed to get logs for %s/%s: %v", ns, pod.Name, err) | ||||||
return | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As with events, use
Suggested change
Copilot uses AI. Check for mistakes. Positive FeedbackNegative Feedback |
||||||
} | ||||||
|
||||||
klog.Infof("Show logs for %s/%s:", ns, pod.Name) | ||||||
klog.Infof(logs) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [nitpick] Passing raw logs to
Suggested change
Copilot uses AI. Check for mistakes. Positive FeedbackNegative Feedback |
||||||
} | ||||||
} | ||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[nitpick] Add a doc comment to describe the purpose and behavior of this public function for better maintainability.
Copilot uses AI. Check for mistakes.