diff --git a/final_project/main.py b/final_project/main.py
index 9be031afa7a71fc4a96b084e886ac3d8e0f23cef..cde4c48271153e92ce8bcd0d5670787a03481380 100644
--- a/final_project/main.py
+++ b/final_project/main.py
@@ -6,12 +6,12 @@
 import numpy as np
 from sklearn.ensemble import RandomForestClassifier
 import time
 import os
+from tkinter import Tk, Label
+from PIL import Image, ImageTk
 
 from mouse_class import Mouse
 from hand_detection import normalise_landmarks
-
-## main: open video and do hand detection
 def main():
     #define Mouse
     mouse = Mouse()
@@ -31,81 +31,99 @@ def main():
 
     # if cannot open video give warning
     if not cap.isOpened():
-        print("Warning: cannot reach camera")
+        print("Warning: Cannot reach camera")
+        return
     else:
-        print("Program is running, push 'q' to quit.")
-
+        print("Program is running, close the video window to quit.")
+
+    # set up Tkinter window
+    root = Tk()
+    root.title("Hand Tracking - Always on Top")
+    root.attributes("-topmost", True)
+    video_label = Label(root)
+    video_label.pack()
+
+    # adjust window geometry
+    # Get the screen width and height
+    screen_width = root.winfo_screenwidth()
+    screen_height = root.winfo_screenheight()
+
+    # Define window size and position (160x120 window at the bottom-right corner)
+    window_width = 160
+    window_height = 120
+    x_position = screen_width - window_width - 10  # 10px margin from the right
+    y_position = screen_height - window_height - 70  # 70px margin from the bottom
+
+    # Set window geometry
+    root.geometry(f"{window_width}x{window_height}+{x_position}+{y_position}")
 
     # mediapipe hand object
-    with mp_hands.Hands( max_num_hands=1, model_complexity=1,
+    with mp_hands.Hands(max_num_hands=1, model_complexity=1,
                         min_detection_confidence=0.9, min_tracking_confidence=0.9) as hands:
-        # read frames from webcamera
-        while cap.isOpened():
+        def update_frame():
             ret, frame = cap.read()
-
             if not ret:
-                print("Warning: cannot read camera input")
-                break
-
-            # flip frame to appear as a mirror
+                print("Warning: Cannot read camera input")
+                root.destroy()
+                return
+
+            # flip frame and process it
             frame = cv2.flip(frame, 1)
             frameRGB = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-
-            ## hand detection
+
+            # Hand detection
            results = hands.process(frameRGB)
             landmark_list = []
             mouse_command = None
 
             if results.multi_hand_landmarks:
-                # multi_hand_landmarks can store two hands, if max_num_hands=2, in which case we have to iterate through the hands with
-                # for num, hand in enumerate(results.multi_hand_landmarks):
-                # one hand is detected, because max_num_hands=1
-                hand_landmarks = results.multi_hand_landmarks[0]
+                hand_landmarks = results.multi_hand_landmarks[0]
 
-                # draw landmarks on frame
-                mp_drawing.draw_landmarks(frameRGB, hand_landmarks, mp_hands.HAND_CONNECTIONS,
-                                          mp_drawing.DrawingSpec(color=(250, 0, 0), thickness=2, circle_radius=4),
-                                          mp_drawing.DrawingSpec(color=(0, 250, 0), thickness=2, circle_radius=2),
-                                          )
+                # Draw landmarks on frame
+                mp_drawing.draw_landmarks(
+                    frameRGB, hand_landmarks, mp_hands.HAND_CONNECTIONS,
+                    mp_drawing.DrawingSpec(color=(250, 0, 0), thickness=2, circle_radius=4),
+                    mp_drawing.DrawingSpec(color=(0, 250, 0), thickness=2, circle_radius=2)
+                )
 
                 # get landmark list with indices described in https://github.com/google-ai-edge/mediapipe/blob/master/mediapipe/python/solutions/hands.py
                 for lm in hand_landmarks.landmark:
                     landmark_list.append((lm.x, lm.y))
-
-                # normalise landmarks for mor powerful training
+
+                # normalise landmarks for more powerful training
                 normalised_landmark_list = normalise_landmarks(landmark_list)
-
+
                 # apply model
                 pred = model.predict(np.asarray(normalised_landmark_list).reshape(1, -1))
                 mouse_command = pred[0]
-                cv2.putText(img = frameRGB, text = pred[0], org = (30,30),
-                            fontFace = cv2.FONT_HERSHEY_DUPLEX, fontScale = 1, color = (255, 0, 0), thickness = 1)
+                cv2.putText(
+                    img=frameRGB, text=pred[0], org=(30, 30),
+                    fontFace=cv2.FONT_HERSHEY_DUPLEX, fontScale=1, color=(255, 0, 0), thickness=1
+                )
 
             mouse.add_prediction(mouse_command)
             if mouse_command == "move cursor" or "grab":
                 mouse.get_hand_pos(landmark_list[8])
-
-            # transform back RGB and show frame with annotation
-            frame_annotated = cv2.cvtColor(frameRGB, cv2.COLOR_RGB2BGR)
-            cv2.imshow('Hand tracking', frame_annotated)
-
-            # or show original frame without annotation
-            # cv2.imshow('Hand tracking', frame)
-            # Check for key presses
-            key = cv2.waitKey(1) & 0xFF
-
-            if key == ord('n'):
-                label = ""
-            elif key == ord('q'):
-                print("Quit camera")
-                break
+            # Convert frame to Tkinter-compatible format and display
+            frameRGB_resized = cv2.resize(frameRGB, (root.winfo_width(), root.winfo_height()))
+            img = ImageTk.PhotoImage(Image.fromarray(frameRGB_resized))
+            video_label.config(image=img)
+            video_label.image = img
+
+            # Refresh frame
+            root.after(10, update_frame)
+
+        # Start updating frames
+        update_frame()
+
+        # Quit the program properly
+        root.protocol("WM_DELETE_WINDOW", lambda: (cap.release(), root.destroy()))
+        root.mainloop()
 
     cap.release()
-    cv2.destroyAllWindows()
-    print("Program closed")
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
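For reference, a minimal, self-contained sketch of the Tk/after() video-preview pattern this diff introduces (MediaPipe processing, the gesture model, and the Mouse class are omitted; camera index 0 and the 10 ms refresh interval are assumptions, not values required by the change):

# Minimal sketch: show webcam frames in an always-on-top Tkinter window.
# Assumptions (not taken from the diff): camera index 0, 10 ms refresh interval.
import cv2
from tkinter import Tk, Label
from PIL import Image, ImageTk

def run_preview():
    cap = cv2.VideoCapture(0)
    root = Tk()
    root.title("Preview")
    root.attributes("-topmost", True)  # keep the window above others
    label = Label(root)
    label.pack()

    def update_frame():
        ret, frame = cap.read()
        if not ret:
            root.destroy()
            return
        # mirror the frame and convert BGR -> RGB for PIL
        rgb = cv2.cvtColor(cv2.flip(frame, 1), cv2.COLOR_BGR2RGB)
        photo = ImageTk.PhotoImage(Image.fromarray(rgb))
        label.config(image=photo)
        label.image = photo  # keep a reference so Tk does not discard the image
        root.after(10, update_frame)  # schedule the next frame

    update_frame()
    root.protocol("WM_DELETE_WINDOW", lambda: (cap.release(), root.destroy()))
    root.mainloop()
    cap.release()

if __name__ == "__main__":
    run_preview()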