more work on go-runner

This commit is contained in:
2024-05-07 01:16:38 +01:00
parent b1e4211e6a
commit 29846012e7
17 changed files with 151 additions and 3249 deletions

View File

@@ -1,6 +1,8 @@
package tasks
import (
"os"
"path"
"sync"
"time"
@@ -383,4 +385,149 @@ func handleRemoteRunner(x *Handle) {
Training: training_points,
})
})
// RunnerTrainDefEpoch is the JSON body decoded by the
// "/tasks/runner/train/epoch" handler registered right after this type.
//
// NOTE(review): every field is tagged validate:"required". If the validator in
// use treats "required" as "not the zero value" (go-playground/validator does),
// an Epoch of 0 or an Accuracy of 0 would be rejected — confirm this is intended.
type RunnerTrainDefEpoch struct {
// Id is handed to verifyRunner and verifyTask by the handler.
Id string `json:"id" validate:"required"`
// TaskId is handed to verifyTask together with Id.
TaskId string `json:"taskId" validate:"required"`
// DefId is the identifier passed to GetDefinition.
DefId string `json:"defId" validate:"required"`
// Epoch is forwarded to Definition.UpdateAfterEpoch.
Epoch int `json:"epoch" validate:"required"`
// Accuracy is forwarded to Definition.UpdateAfterEpoch.
Accuracy float64 `json:"accuracy" validate:"required"`
}
// Runner endpoint: store the outcome of one training epoch for a definition.
//
// The request is checked with verifyRunner and verifyTask, the task must be of
// the training type, and the reported accuracy/epoch are then written through
// Definition.UpdateAfterEpoch. Responds with "Ok" on success.
PostAuthJson(x, "/tasks/runner/train/epoch", User_Normal, func(c *Context, dat *RunnerTrainDefEpoch) *Error {
	if _, runnerErr := verifyRunner(c, &JustId{Id: dat.Id}); runnerErr != nil {
		return runnerErr
	}

	lookup := VerifyTask{
		Id:     dat.Id,
		TaskId: dat.TaskId,
	}
	task, taskErr := verifyTask(x, c, &lookup)
	if taskErr != nil {
		return taskErr
	}

	// Only training tasks are allowed to report epochs.
	if taskType := task.TaskType; taskType != int(TASK_TYPE_TRAINING) {
		c.Logger.Error("Task not is not the right type to get the definitions", "task type", taskType)
		return c.JsonBadRequest("Task is not the right type go get the definitions")
	}

	definition, opErr := GetDefinition(c, dat.DefId)
	if opErr != nil {
		return c.E500M("Failed to get definition information", opErr)
	}

	if opErr = definition.UpdateAfterEpoch(c, dat.Accuracy, dat.Epoch); opErr != nil {
		return c.E500M("Failed to update model", opErr)
	}

	return c.SendJSON("Ok")
})
PostAuthJson(x, "/task/runner/train/mark-failed", User_Normal, func(c *Context, dat *VerifyTask) *Error {
_, error := verifyRunner(c, &JustId{Id: dat.Id})
if error != nil {
return error
}
task, error := verifyTask(x, c, &VerifyTask{
Id: dat.Id,
TaskId: dat.TaskId,
})
if error != nil {
return error
}
if task.TaskType != int(TASK_TYPE_TRAINING) {
c.Logger.Error("Task not is not the right type to get the definitions", "task type", task.TaskType)
return c.JsonBadRequest("Task is not the right type go get the definitions")
}
_, err := c.Exec(
"update model_definition set status=$1 "+
"where model_id=$2 and status in ($3, $4)",
MODEL_DEFINITION_STATUS_CANCELD_TRAINING,
task.ModelId,
MODEL_DEFINITION_STATUS_TRAINING,
MODEL_DEFINITION_STATUS_PAUSED_TRAINING,
)
if err != nil {
return c.E500M("Failed to mark definition as failed", err)
}
return c.SendJSON("Ok")
})
// Runner endpoint: a runner reports that training for its task has finished.
//
// Flow:
//  1. Check the runner and the task, and require a training task with a model.
//  2. Pick the trained definition with the highest accuracy and mark it READY.
//     If no definition finished training, the model is marked FAILED_TRAINING,
//     the task TASK_FAILED_RUNNING, and the request still answers "Ok".
//  3. Remove the on-disk data and the database rows of every definition of
//     the model that is not READY.
//  4. Mark the model READY.
//
// Any failure after the model was loaded marks the model FAILED_TRAINING and
// the task TASK_FAILED_RUNNING before answering with a 500.
PostAuthJson(x, "/task/runner/train/done", User_Normal, func(c *Context, dat *VerifyTask) *Error {
	if _, runnerErr := verifyRunner(c, &JustId{Id: dat.Id}); runnerErr != nil {
		return runnerErr
	}

	task, taskErr := verifyTask(x, c, dat)
	if taskErr != nil {
		return taskErr
	}

	if task.TaskType != int(TASK_TYPE_TRAINING) {
		c.Logger.Error("Task not is not the right type to get the definitions", "task type", task.TaskType)
		return c.JsonBadRequest("Task is not the right type go get the definitions")
	}

	// ModelId is a pointer; dereferencing it blindly would panic the handler
	// for a task that has no model attached.
	if task.ModelId == nil {
		c.Logger.Error("Training task has no model associated", "task", dat.TaskId)
		return c.JsonBadRequest("Task does not have a model associated")
	}

	model, err := GetBaseModel(c, *task.ModelId)
	if err != nil {
		c.Logger.Error("Failed to get model", "err", err)
		return c.E500M("Failed to get mode", err)
	}

	// fail records the failure on both the model and the task and builds the
	// 500 response; every error path below shares this exact sequence.
	fail := func(msg string, cause error) *Error {
		model.UpdateStatus(c, FAILED_TRAINING)
		task.UpdateStatusLog(c, TASK_FAILED_RUNNING, msg)
		return c.E500M(msg, cause)
	}

	var def Definition
	err = GetDBOnce(c, &def, "from model_definition as md where model_id=$1 and status=$2 order by accuracy desc limit 1;", task.ModelId, DEFINITION_STATUS_TRANIED)
	if err == NotFoundError {
		// TODO Make the Model status have a message
		c.Logger.Error("All definitions failed to train!")
		model.UpdateStatus(c, FAILED_TRAINING)
		task.UpdateStatusLog(c, TASK_FAILED_RUNNING, "All definition failed to train!")
		return c.SendJSON("Ok")
	} else if err != nil {
		return fail("Failed to get model definition", err)
	}

	if err = def.UpdateStatus(c, DEFINITION_STATUS_READY); err != nil {
		return fail("Failed to update model definition", err)
	}

	rows, err := c.Query("select id from model_definition where status != $1 and model_id=$2", MODEL_DEFINITION_STATUS_READY, model.Id)
	if err != nil {
		return fail("Failed to delete unsed definitions", err)
	}
	defer rows.Close()

	// Collect the ids first so nothing is removed from disk unless the whole
	// result set could be read.
	var staleIds []string
	for rows.Next() {
		var id string
		if err = rows.Scan(&id); err != nil {
			return fail("Failed to delete unsed definitions", err)
		}
		staleIds = append(staleIds, id)
	}
	// Next() returning false can also mean the iteration failed.
	if err = rows.Err(); err != nil {
		return fail("Failed to delete unsed definitions", err)
	}
	// Release the result set before running the delete statement below; the
	// deferred Close above only covers the early returns.
	rows.Close()

	for _, id := range staleIds {
		dir := path.Join("savedData", model.Id, "defs", id)
		// Removing the files is best effort: report the problem but keep going
		// so the database rows still get cleaned up.
		if rmErr := os.RemoveAll(dir); rmErr != nil {
			c.Logger.Error("Failed to remove definition data", "path", dir, "err", rmErr)
		}
	}

	// TODO Check if returning also works here
	if _, err = c.Exec("delete from model_definition where status!=$1 and model_id=$2;", MODEL_DEFINITION_STATUS_READY, model.Id); err != nil {
		return fail("Failed to delete unsed definitions", err)
	}

	model.UpdateStatus(c, READY)

	return c.SendJSON("Ok")
})
}