Improved classification performance

2024-05-15 05:32:49 +01:00
parent 516d1d7634
commit 652542d261
18 changed files with 211 additions and 98 deletions
--- a/logic/models/add.go
+++ b/logic/models/add.go
@@ -16,7 +16,6 @@ import (
 )

 func loadBaseImage(c *Context, id string) {
-	// TODO handle more types than png
 	infile, err := os.Open(path.Join("savedData", id, "baseimage.png"))
 	if err != nil {
 		c.Logger.Errorf("Failed to read image for model with id %s\n", id)
--- a/logic/models/run.go
+++ b/logic/models/run.go
@@ -4,6 +4,7 @@ import (
 	"errors"
 	"os"
 	"path"
+	"runtime/debug"

 	. "git.andr3h3nriqu3s.com/andr3/fyp/logic/db_types"
 	. "git.andr3h3nriqu3s.com/andr3/fyp/logic/tasks/utils"
@@ -37,11 +38,19 @@ func ReadJPG(scope *op.Scope, imagePath string, channels int64) *image.Image {
 	return image.Scale(0, 255)
 }

-func runModelNormal(model *BaseModel, def_id string, inputImage *tf.Tensor) (order int, confidence float32, err error) {
+func runModelNormal(model *BaseModel, def_id string, inputImage *tf.Tensor, data *RunnerModelData) (order int, confidence float32, err error) {
 	order = 0
 	err = nil

-	tf_model := tg.LoadModel(path.Join("savedData", model.Id, "defs", def_id, "model"), []string{"serve"}, nil)
+	var tf_model *tg.Model = nil
+
+	if data.Id != nil && *data.Id == def_id {
+		tf_model = data.Model
+	} else {
+		tf_model = tg.LoadModel(path.Join("savedData", model.Id, "defs", def_id, "model"), []string{"serve"}, nil)
+		data.Model = tf_model
+		data.Id = &def_id
+	}

 	results := tf_model.Exec([]tf.Output{
 		tf_model.Op("StatefulPartitionedCall", 0),
@@ -125,10 +134,15 @@ func runModelExp(base BasePack, model *BaseModel, def_id string, inputImage *tf.
 	return
 }

-func ClassifyTask(base BasePack, task Task) (err error) {
+type RunnerModelData struct {
+	Id    *string
+	Model *tg.Model
+}
+
+func ClassifyTask(base BasePack, task Task, data *RunnerModelData) (err error) {
 	defer func() {
 		if r := recover(); r != nil {
-			base.GetLogger().Error("Task failed due to", "error", r)
+			base.GetLogger().Error("Task failed due to", "error", r, "stack", string(debug.Stack()))
 			task.UpdateStatusLog(base, TASK_FAILED_RUNNING, "Task failed running")
 		}
 	}()
@@ -186,6 +200,8 @@ func ClassifyTask(base BasePack, task Task) (err error) {

 	if model.ModelType == 2 {
 		base.GetLogger().Info("Running model normal", "model", model.Id, "def", def_id)
+		data.Model = nil
+		data.Id = nil
 		vi, confidence, err = runModelExp(base, model, def_id, inputImage)
 		if err != nil {
 			task.UpdateStatusLog(base, TASK_FAILED_RUNNING, "Failed to run model")
@@ -193,7 +209,7 @@ func ClassifyTask(base BasePack, task Task) (err error) {
 		}
 	} else {
 		base.GetLogger().Info("Running model normal", "model", model.Id, "def", def_id)
-		vi, confidence, err = runModelNormal(model, def_id, inputImage)
+		vi, confidence, err = runModelNormal(model, def_id, inputImage, data)
 		if err != nil {
 			task.UpdateStatusLog(base, TASK_FAILED_RUNNING, "Failed to run model")
 			return
--- a/logic/models/train/train.go
+++ b/logic/models/train/train.go
@@ -1191,7 +1191,7 @@ func generateDefinition(c BasePack, model *BaseModel, target_accuracy int, numbe
 	}
 	order++

-	loop := max(1, int((math.Log(float64(model.Width)) / math.Log(float64(10)))))
+	loop := max(1, int(math.Ceil((math.Log(float64(model.Width))/math.Log(float64(10)))))+1)
 	for i := 0; i < loop; i++ {
 		_, err = def.MakeLayer(db, order, LAYER_SIMPLE_BLOCK, "")
 		order++
@@ -1299,7 +1299,7 @@ func generateExpandableDefinition(c BasePack, model *BaseModel, target_accuracy
 	order++

 	// Create the blocks
-	loop := int((math.Log(float64(model.Width)) / math.Log(float64(10))))
+	loop := int(math.Ceil((math.Log(float64(model.Width)) / math.Log(float64(10))))) + 1

 	/*if model.Width < 50 && model.Height < 50 {
 	loop = 0
--- a/logic/stats/tasks.go
+++ b/logic/stats/tasks.go
@@ -68,7 +68,7 @@ func handleTasksStats(handle *Handle) {
 				} else if task.Status < 2 {
 					total.Classfication_pre += 1
 					hours[hour].Classfication_pre += 1
-				} else if task.Status < 4 {
+				} else if task.Status < 4 || task.Status == 5 {
 					total.Classfication_running += 1
 					hours[hour].Classfication_running += 1
 				}
--- a/logic/tasks/runner/runner.go
+++ b/logic/tasks/runner/runner.go
@@ -19,6 +19,8 @@ import (
 	. "git.andr3h3nriqu3s.com/andr3/fyp/logic/utils"
 )

+var QUEUE_SIZE = 10
+
 /**
 * Actually runs the code
 */
@@ -47,17 +49,28 @@ func runner(config Config, db db.Db, task_channel chan Task, index int, back_cha
 		Host:   config.Hostname,
 	}

+	loaded_model := RunnerModelData{
+		Id:    nil,
+		Model: nil,
+	}
+
+	count := 0
 	for task := range task_channel {
 		logger.Info("Got task", "task", task)
 		task.UpdateStatusLog(base, TASK_PICKED_UP, "Runner picked up task")

 		if task.TaskType == int(TASK_TYPE_CLASSIFICATION) {
 			logger.Info("Classification Task")
-			if err = ClassifyTask(base, task); err != nil {
+			if err = ClassifyTask(base, task, &loaded_model); err != nil {
 				logger.Error("Classification task failed", "error", err)
 			}

-			back_channel <- index
+			if count == QUEUE_SIZE {
+				back_channel <- index
+				count = 0
+			} else {
+				count += 1
+			}
 			continue
 		} else if task.TaskType == int(TASK_TYPE_TRAINING) {
 			logger.Info("Training Task")
@@ -65,7 +78,12 @@ func runner(config Config, db db.Db, task_channel chan Task, index int, back_cha
 				logger.Error("Failed to tain the model", "error", err)
 			}

-			back_channel <- index
+			if count == QUEUE_SIZE {
+				back_channel <- index
+				count = 0
+			} else {
+				count += 1
+			}
 			continue
 		} else if task.TaskType == int(TASK_TYPE_RETRAINING) {
 			logger.Info("Retraining Task")
@@ -73,7 +91,12 @@ func runner(config Config, db db.Db, task_channel chan Task, index int, back_cha
 				logger.Error("Failed to tain the model", "error", err)
 			}

-			back_channel <- index
+			if count == QUEUE_SIZE {
+				back_channel <- index
+				count = 0
+			} else {
+				count += 1
+			}
 			continue
 		} else if task.TaskType == int(TASK_TYPE_DELETE_USER) {
 			logger.Warn("User deleting Task")
@@ -81,13 +104,23 @@ func runner(config Config, db db.Db, task_channel chan Task, index int, back_cha
 				logger.Error("Failed to tain the model", "error", err)
 			}

-			back_channel <- index
+			if count == QUEUE_SIZE {
+				back_channel <- index
+				count = 0
+			} else {
+				count += 1
+			}
 			continue
 		}

 		logger.Error("Do not know how to route task", "task", task)
 		task.UpdateStatusLog(base, TASK_FAILED_RUNNING, "Do not know how to route task")
-		back_channel <- index
+		if count == QUEUE_SIZE {
+			back_channel <- index
+			count = 0
+		} else {
+			count += 1
+		}
 	}
 }

@@ -145,7 +178,7 @@ func handleRemoteTask(handler *Handle, base BasePack, runner_id string, task Tas
 /**
 * Tells the orcchestator to look at the task list from time to time
 */
-func attentionSeeker(config Config, back_channel chan int) {
+func attentionSeeker(config Config, db db.Db, back_channel chan int) {
 	logger := log.NewWithOptions(os.Stdout, log.Options{
 		ReportCaller:    true,
 		ReportTimestamp: true,
@@ -170,6 +203,20 @@ func attentionSeeker(config Config, back_channel chan int) {
 	for true {
 		back_channel <- 0

+		for {
+			var s struct {
+				Count int `json:"count(*)"`
+			}
+			err := GetDBOnce(db, &s, "tasks where stauts = 5 or status = 3")
+			if err != nil {
+				break
+			}
+			if s.Count == 0 {
+				break
+			}
+			time.Sleep(t)
+		}
+
 		time.Sleep(t)
 	}
 }
@@ -194,11 +241,16 @@ func RunnerOrchestrator(db db.Db, config Config, handler *Handle) {
 	}

 	gpu_workers := config.GpuWorker.NumberOfWorkers
+	def_wait, err := time.ParseDuration(config.GpuWorker.Pulling)
+	if err != nil {
+		logger.Error("Failed to load", "error", err)
+		return
+	}

 	logger.Info("Starting runners")

 	task_runners := make([]chan Task, gpu_workers)
-	task_runners_used := make([]bool, gpu_workers)
+	task_runners_used := make([]int, gpu_workers)
 	// One more to accomudate the Attention Seeker channel
 	back_channel := make(chan int, gpu_workers+1)

@@ -213,12 +265,12 @@ func RunnerOrchestrator(db db.Db, config Config, handler *Handle) {
 		}
 	}()

-	go attentionSeeker(config, back_channel)
+	// go attentionSeeker(config, db, back_channel)

 	// Start the runners
 	for i := 0; i < gpu_workers; i++ {
-		task_runners[i] = make(chan Task, 10)
-		task_runners_used[i] = false
+		task_runners[i] = make(chan Task, QUEUE_SIZE)
+		task_runners_used[i] = 0
 		AddLocalRunner(handler, LocalRunner{
 			RunnerNum: i + 1,
 			Task:      nil,
@@ -226,82 +278,107 @@ func RunnerOrchestrator(db db.Db, config Config, handler *Handle) {
 		go runner(config, db, task_runners[i], i+1, back_channel)
 	}

-	var task_to_dispatch *Task = nil
-
-	for i := range back_channel {
-
-		if i != 0 {
-			if i > 0 {
-				logger.Info("Runner freed", "runner", i)
-				task_runners_used[i-1] = false
-			} else if i < 0 {
-				logger.Error("Runner died! Restarting!", "runner", i)
-				i = int(math.Abs(float64(i)) - 1)
-				task_runners_used[i] = false
-				go runner(config, db, task_runners[i], i+1, back_channel)
+	used := 0
+	wait := time.Nanosecond * 100
+	for {
+		out := true
+		for out {
+			select {
+			case i := <-back_channel:
+				if i != 0 {
+					if i > 0 {
+						logger.Info("Runner freed", "runner", i)
+						task_runners_used[i-1] = 0
+						used = 0
+					} else if i < 0 {
+						logger.Error("Runner died! Restarting!", "runner", i)
+						i = int(math.Abs(float64(i)) - 1)
+						task_runners_used[i] = 0
+						used = 0
+						go runner(config, db, task_runners[i], i+1, back_channel)
+					}
+					AddLocalTask(handler, int(math.Abs(float64(i))), nil)
+				} else if used == len(task_runners_used) {
+					continue
+				}
+			case <-time.After(wait):
+				if wait == time.Nanosecond*100 {
+					wait = def_wait
+				}
+				out = false
 			}
-			AddLocalTask(handler, int(math.Abs(float64(i))), nil)
 		}

-		if task_to_dispatch == nil {
-			var task TaskT
-			err := GetDBOnce(db, &task, "tasks as t "+
+		for {
+			tasks, err := GetDbMultitple[TaskT](db, "tasks as t "+
 				// Get depenencies
 				"left join tasks_dependencies as td on t.id=td.main_id "+
 				// Get the task that the depencey resolves to
 				"left join tasks as t2 on t2.id=td.dependent_id "+
 				"where t.status=1 "+
-				"group by t.id having count(td.id) filter (where t2.status in (0,1,2,3)) = 0;")
+				"group by t.id having count(td.id) filter (where t2.status in (0,1,2,3)) = 0 limit 20;")
 			if err != NotFoundError && err != nil {
 				log.Error("Failed to get tasks from db", "err", err)
 				continue
 			}
-			if err == NotFoundError {
-				task_to_dispatch = nil
-			} else {
-				temp := Task(task)
-				task_to_dispatch = &temp
-			}
-		}
-
-		if task_to_dispatch != nil && task_to_dispatch.TaskType != int(TASK_TYPE_DELETE_USER) {
-			// TODO split tasks into cpu tasks and GPU tasks
-			mutex := handler.DataMap["runners_mutex"].(*sync.Mutex)
-			mutex.Lock()
-			remote_runners := handler.DataMap["runners"].(map[string]interface{})
-
-			for k, v := range remote_runners {
-				runner_data := v.(map[string]interface{})
-				runner_info := runner_data["runner_info"].(*Runner)
-
-				if runner_data["task"] != nil {
-					continue
-				}
-
-				if runner_info.UserId != task_to_dispatch.UserId {
-					continue
-				}
-
-				go handleRemoteTask(handler, base, k, *task_to_dispatch)
-				task_to_dispatch = nil
+			if err == NotFoundError || len(tasks) == 0 {
 				break
 			}

-			mutex.Unlock()
-		}
+			for _, task_to_dispatch := range tasks {
+				ttd := Task(*task_to_dispatch)
+				if task_to_dispatch != nil && task_to_dispatch.TaskType != int(TASK_TYPE_DELETE_USER) {
+					// TODO split tasks into cpu tasks and GPU tasks
+					mutex := handler.DataMap["runners_mutex"].(*sync.Mutex)
+					mutex.Lock()
+					remote_runners := handler.DataMap["runners"].(map[string]interface{})

-		if task_to_dispatch != nil {
-			for i := 0; i < len(task_runners_used); i += 1 {
-				if !task_runners_used[i] {
-					task_runners[i] <- *task_to_dispatch
-					task_runners_used[i] = true
-					AddLocalTask(handler, i+1, task_to_dispatch)
-					task_to_dispatch = nil
+					for k, v := range remote_runners {
+						runner_data := v.(map[string]interface{})
+						runner_info := runner_data["runner_info"].(*Runner)
+
+						if runner_data["task"] != nil {
+							continue
+						}
+
+						if runner_info.UserId != task_to_dispatch.UserId {
+							continue
+						}
+
+						go handleRemoteTask(handler, base, k, ttd)
+						task_to_dispatch = nil
+						break
+					}
+
+					mutex.Unlock()
+				}
+
+				used = 0
+				if task_to_dispatch != nil {
+					for i := 0; i < len(task_runners_used); i += 1 {
+						if task_runners_used[i] <= QUEUE_SIZE {
+							ttd.UpdateStatusLog(base, TASK_QUEUED, "Runner picked up task")
+							task_runners[i] <- ttd
+							task_runners_used[i] += 1
+							AddLocalTask(handler, i+1, &ttd)
+							task_to_dispatch = nil
+							wait = time.Nanosecond * 100
+							break
+						} else {
+							used += 1
+						}
+					}
+				}
+
+				if used == len(task_runners_used) {
 					break
 				}
 			}
-		}

+			if used == len(task_runners_used) {
+				break
+			}
+		}
 	}
 }

--- a/logic/tasks/utils/utils.go
+++ b/logic/tasks/utils/utils.go
@@ -50,6 +50,7 @@ const (
 	TASK_PREPARING                  = 0
 	TASK_TODO                       = 1
 	TASK_PICKED_UP                  = 2
+	TASK_QUEUED                     = 5
 	TASK_RUNNING                    = 3
 	TASK_DONE                       = 4
 )
--- a/logic/utils/config.go
+++ b/logic/utils/config.go
@@ -102,7 +102,7 @@ func (c *Config) Cleanup(db db.Db) {
 	failLog(err)
 	_, err = db.Exec("update models set status=$1 where status=$2", FAILED_PREPARING, PREPARING)
 	failLog(err)
-	_, err = db.Exec("update tasks set status=$1 where status=$2", TASK_TODO, TASK_PICKED_UP)
+	_, err = db.Exec("update tasks set status=$1 where status=$2 or status=$3", TASK_TODO, TASK_PICKED_UP, TASK_QUEUED)
 	failLog(err)

 	tasks, err := GetDbMultitple[Task](db, "tasks where status=$1", TASK_RUNNING)