diff --git a/distribute_tasks.py b/distribute_tasks.py index eeeaca2..523f88c 100644 --- a/distribute_tasks.py +++ b/distribute_tasks.py @@ -4,6 +4,7 @@ import threading import queue import sys import time +import random import yaml import paramiko @@ -67,7 +68,20 @@ class WorkingThread(threading.Thread): def run(self): self.client = paramiko.client.SSHClient() self.client.load_system_host_keys() - self.client.connect(self.host, username=CONFIG["username"]) + for n_try in range(3): + try: + self.client.connect(self.host, username=CONFIG["username"]) + break + except Exception as exn: + delay = 3 + random.random() * 4 + print( + ( + "[{}] Failed to connect. Retry in {} seconds." + + "Exception:\n{}" + ).format(self.host, delay, exn), + file=sys.stderr, + ) + time.sleep(delay) try: while True: @@ -176,7 +190,6 @@ class Orchestrator: def start(self): for thread in self.threads: thread.start() - time.sleep(0.1) for thread in self.threads: while thread.is_alive():