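"""Gesture-controlled mouse: MediaPipe detects hand landmarks in the webcam
feed, a pickled scikit-learn classifier maps them to commands, and pyautogui
executes the corresponding cursor, click, and scroll actions."""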
import cv2
import random
import mediapipe as mp
import pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pyautogui
import time
from collections import Counter
from screeninfo import get_monitors
MONITOR = get_monitors()[0]
WIDTH, HEIGHT = MONITOR.width, MONITOR.height
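# normalized MediaPipe landmark coordinates (0..1) are scaled by these screen
# dimensions to map hand positions to pixels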
class Mouse:
    def __init__(self) -> None:
        self.predictions = []
        self.previous_action = None
        self.freeze_action = False
        # parameters to fine-tune
        self.action_length = 5      # number of frame predictions pooled into one action
        self.move_distance = 10
        self.scroll_distance = 10

    def get_hand_pos(self, hand_pos):
        self.hand_pos_x = hand_pos[0]
        self.hand_pos_y = hand_pos[1]

    def add_prediction(self, prediction):
        self.predictions.append(prediction)
        if len(self.predictions) == self.action_length:
            self.make_action()

    def make_action(self):
        action = self.get_major_element(self.predictions)
        if self.freeze_action and action == self.previous_action:
            # a frozen (click-type) action must not repeat on consecutive windows
            self.update_init(action)
        else:
            self.mouse_control(action)
            self.update_init(action)

    def update_init(self, action):
        self.predictions = []
        self.previous_action = action
        self.freeze_action = action in {"left click", "right click", "double click"}  # maybe extend with keyboard and drop gestures

    def mouse_hand_parameters(self):
        pass
    def mouse_control(self, prediction):
        if prediction == "stop execution" or prediction is None:
            pass  # stop movement
        elif prediction == "move cursor":
            hand_x = np.clip(int(self.hand_pos_x * WIDTH), 0, WIDTH - 1)
            hand_y = np.clip(int(self.hand_pos_y * HEIGHT), 0, HEIGHT - 1)
            pyautogui.moveTo(hand_x, hand_y)
        elif prediction == "stop moving":
            pyautogui.move(0, 0)  # stop cursor
        elif prediction == "left click":
            pyautogui.click()  # left click
        elif prediction == "right click":
            pyautogui.click(button='right')  # right click
        elif prediction == "double click":
            pyautogui.click(clicks=2)  # double click
        elif prediction == "scrolling up":
            pyautogui.scroll(self.scroll_distance)  # scroll up
        elif prediction == "scrolling down":
            pyautogui.scroll(-self.scroll_distance)  # scroll down
        elif prediction == "scrolling right":
            # NOTE: hscroll does not work on Windows
            pyautogui.hscroll(self.scroll_distance)  # scroll right
        elif prediction == "scrolling left":
            # NOTE: hscroll does not work on Windows
            pyautogui.hscroll(-self.scroll_distance)  # scroll left
        elif prediction == "grab":
            # cursor follows the hand while grabbing
            hand_x = np.clip(int(self.hand_pos_x * WIDTH), 0, WIDTH - 1)
            hand_y = np.clip(int(self.hand_pos_y * HEIGHT), 0, HEIGHT - 1)
            pyautogui.moveTo(hand_x, hand_y)
        elif prediction == "multiple item selection grab":
            pass
        elif prediction == "multiple item selection drop":
            pass
        elif prediction == "change to keyboard":
            pass
        # time.sleep(self.time_checking)  # adjust speed of movement
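    # NOTE: pyautogui's fail-safe stays enabled (FAILSAFE defaults to True): moving
    # the cursor into the upper-left screen corner raises FailSafeException, which
    # doubles as an emergency stop for runaway gestures.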
    def get_major_element(self, string_list):
        counts = Counter(string_list)
        # find the element with the maximum count
        major_element, _ = counts.most_common(1)[0]
        return major_element
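    # Example: with action_length = 5, predictions like
    # ["move cursor", "move cursor", "left click", "move cursor", "scrolling up"]
    # vote to "move cursor"; Counter.most_common orders ties by first occurrence.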
def normalise_landmarks(landmark_list):
    if len(landmark_list) == 0:
        return landmark_list

    x = [lm[0] for lm in landmark_list]
    y = [lm[1] for lm in landmark_list]
    min_x, max_x = min(x), max(x)
    min_y, max_y = min(y), max(y)

    # min-max scale each coordinate into [0, 1] within the hand's bounding box
    # (assumes a detected hand spans a box of nonzero width and height)
    normalised_landmarks = []
    for lm in landmark_list:
        x_norm = (lm[0] - min_x) / (max_x - min_x)
        y_norm = (lm[1] - min_y) / (max_y - min_y)
        normalised_landmarks.append((x_norm, y_norm))

    return normalised_landmarks
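# Example: normalise_landmarks([(0.2, 0.4), (0.4, 0.8), (0.3, 0.6)])
# returns [(0.0, 0.0), (1.0, 1.0), (0.5, 0.5)]: coordinates are rescaled to the
# hand's own bounding box, making the classifier indifferent to where on the
# screen the hand appears and how large it is.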
## main: open video and do hand detection
def main():
    # define Mouse
    mouse = Mouse()

    model_dict = pickle.load(open('./trained_Moni_data.p', 'rb'))
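    # the pickle is expected to hold a dict with the trained classifier under 'model'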
    model = model_dict['model']

    # create hand detection object
    mp_hands = mp.solutions.hands
    mp_drawing = mp.solutions.drawing_utils

    # open video
    cap = cv2.VideoCapture(0)

    # if cannot open video give warning
    if not cap.isOpened():
        print("Warning: cannot reach camera")
    else:
        print("Program is running, push 'q' to quit.")

    # mediapipe hand object
    with mp_hands.Hands(max_num_hands=1, model_complexity=1,
                        min_detection_confidence=0.9, min_tracking_confidence=0.9) as hands:
        # read frames from webcamera
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                print("Warning: cannot read camera input")
                break

            # flip frame to appear as a mirror
            frame = cv2.flip(frame, 1)
            frameRGB = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            ## hand detection
            results = hands.process(frameRGB)

            landmark_list = []
            if results.multi_hand_landmarks:
                # multi_hand_landmarks can store two hands, if max_num_hands=2, in which case we have to iterate through the hands with
                # for num, hand in enumerate(results.multi_hand_landmarks):
                # here exactly one hand is detected, because max_num_hands=1
                hand_landmarks = results.multi_hand_landmarks[0]

                # draw landmarks on frame
                mp_drawing.draw_landmarks(frameRGB, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                                          mp_drawing.DrawingSpec(color=(250, 0, 0), thickness=2, circle_radius=4),
                                          mp_drawing.DrawingSpec(color=(0, 250, 0), thickness=2, circle_radius=2),
                                          )

                # get landmark list with indices described in https://github.com/google-ai-edge/mediapipe/blob/master/mediapipe/python/solutions/hands.py
                for lm in hand_landmarks.landmark:
                    landmark_list.append((lm.x, lm.y))

                # normalise landmarks so they match the representation used in training
                normalised_landmark_list = normalise_landmarks(landmark_list)

                # apply model
                pred = model.predict(np.asarray(normalised_landmark_list).reshape(1, -1))
                mouse_command = pred[0]
                cv2.putText(img=frameRGB, text=mouse_command, org=(30, 30),
                            fontFace=cv2.FONT_HERSHEY_DUPLEX, fontScale=1, color=(255, 0, 0), thickness=1)

                mouse.add_prediction(mouse_command)
                if mouse_command in ("move cursor", "grab"):
                    mouse.get_hand_pos(landmark_list[8])  # index fingertip drives the cursor

            # transform back RGB and show frame with annotation
            frame_annotated = cv2.cvtColor(frameRGB, cv2.COLOR_RGB2BGR)
            cv2.imshow('Hand tracking', frame_annotated)
            # or show original frame without annotation
            # cv2.imshow('Hand tracking', frame)

            # check for key presses
            key = cv2.waitKey(1) & 0xFF
            if key == ord('n'):
                label = ""  # appears to be a leftover from the data-collection script
            elif key == ord('q'):
                print("Quit camera")
                break

    cap.release()
    cv2.destroyAllWindows()
    print("Program closed")
if __name__ == '__main__':
    main()