diff --git a/logic/db_types/layer.go b/logic/db_types/layer.go
index 6897770..5e00ad3 100644
--- a/logic/db_types/layer.go
+++ b/logic/db_types/layer.go
@@ -2,8 +2,10 @@ package dbtypes
 
 import (
 	"encoding/json"
+	"fmt"
 
 	"git.andr3h3nriqu3s.com/andr3/fyp/logic/db"
+	"github.com/charmbracelet/log"
 )
 
 type LayerType int
@@ -24,9 +26,24 @@ type Layer struct {
 	ExpType   int    `db:"mdl.exp_type" json:"exp_type"`
 }
 
+func (x *Layer) ShapeToSize() error {
+	v := x.GetShape()
+	switch x.LayerType {
+	case LAYER_INPUT:
+		x.Shape = fmt.Sprintf("%d,%d", v[1], v[2])
+	case LAYER_DENSE:
+		x.Shape = fmt.Sprintf("(%d)", v[0])
+	default:
+		x.Shape = "ERROR"
+		// DO nothing
+	}
+	return nil
+}
+
 func ShapeToString(args ...int) string {
 	text, err := json.Marshal(args)
 	if err != nil {
+		log.Error("json err!", "err", err)
 		panic("Could not generate Shape")
 	}
 	return string(text)
@@ -35,12 +52,16 @@ func ShapeToString(args ...int) string {
 func StringToShape(str string) (shape []int64) {
 	err := json.Unmarshal([]byte(str), &shape)
 	if err != nil {
+		log.Error("json err!", "err", err)
 		panic("Could not parse Shape")
 	}
 	return
 }
 
 func (l Layer) GetShape() []int64 {
+	if l.Shape == "" {
+		return []int64{}
+	}
 	return StringToShape(l.Shape)
 }
 
diff --git a/logic/models/train/train.go b/logic/models/train/train.go
index e911d59..cba1a22 100644
--- a/logic/models/train/train.go
+++ b/logic/models/train/train.go
@@ -161,40 +161,23 @@ func generateCvsExp(c BasePack, run_path string, model_id string, doPanic bool)
 	return
 }
 
-func trainDefinition(c BasePack, model *BaseModel, definition_id string, load_prev bool) (accuracy float64, err error) {
+func trainDefinition(c BasePack, model *BaseModel, def Definition, load_prev bool) (accuracy float64, err error) {
 	l := c.GetLogger()
-	db := c.GetDb()
 
 	l.Warn("About to start training definition")
 
 	accuracy = 0
-	layers, err := db.Query("select layer_type, shape from model_definition_layer where def_id=$1 order by layer_order asc;", definition_id)
+
+	layers, err := def.GetLayers(c.GetDb(), " order by layer_order asc;")
 	if err != nil {
 		return
 	}
-	defer layers.Close()
-	type layerrow struct {
-		LayerType int
-		Shape     string
-		LayerNum  int
-	}
-
-	got := []layerrow{}
-	i := 1
-
-	for layers.Next() {
-		var row = layerrow{}
-		if err = layers.Scan(&row.LayerType, &row.Shape); err != nil {
-			return
-		}
-		row.Shape = shapeToSize(row.Shape)
-		row.LayerNum = 1
-		got = append(got, row)
-		i = i + 1
+	for _, layer := range layers {
+		layer.ShapeToSize()
 	}
 
 	// Generate run folder
-	run_path := path.Join("/tmp", model.Id, "defs", definition_id)
+	run_path := path.Join("/tmp", model.Id, "defs", def.Id)
 
 	err = os.MkdirAll(run_path, os.ModePerm)
 	if err != nil {
@@ -219,17 +202,17 @@ func trainDefinition(c BasePack, model *BaseModel, definition_id string, load_pr
 	}
 
 	// Copy result around
-	result_path := path.Join("savedData", model.Id, "defs", definition_id)
+	result_path := path.Join("savedData", model.Id, "defs", def.Id)
 
 	if err = tmpl.Execute(f, AnyMap{
-		"Layers": got,
-		"Size":   got[0].Shape,
+		"Layers": layers,
+		"Size":   layers[0].Shape,
 		"DataDir": path.Join(getDir(), "savedData", model.Id, "data"),
 		"RunPath": run_path,
 		"ColorMode": model.ImageMode,
 		"Model": model,
 		"EPOCH_PER_RUN": EPOCH_PER_RUN,
-		"DefId": definition_id,
+		"DefId": def.Id,
 		"LoadPrev": load_prev,
 		"LastModelRunPath": path.Join(getDir(), result_path, "model.keras"),
 		"SaveModelPath": path.Join(getDir(), result_path),
@@ -727,30 +710,16 @@ func trainModel(c BasePack, model *BaseModel) (err error) {
 	db := c.GetDb()
 	l := c.GetLogger()
 
-	definitionsRows, err := db.Query("select id, target_accuracy, epoch from model_definition where status=$1 and model_id=$2", MODEL_DEFINITION_STATUS_INIT, model.Id)
+	defs_, err := model.GetDefinitions(db, "and md.status=$2", MODEL_DEFINITION_STATUS_INIT)
 	if err != nil {
-		l.Error("Failed to train Model! Err:")
-		l.Error(err)
+		l.Error("Failed to train Model!", "err", err)
 		ModelUpdateStatus(c, model.Id, int(FAILED_TRAINING))
 		return
 	}
-	defer definitionsRows.Close()
 
-	var definitions TraingModelRowDefinitions = []TrainModelRow{}
+	var defs SortByAccuracyDefinitions = defs_
 
-	for definitionsRows.Next() {
-		var rowv TrainModelRow
-		rowv.acuracy = 0
-		if err = definitionsRows.Scan(&rowv.id, &rowv.target_accuracy, &rowv.epoch); err != nil {
-			l.Error("Failed to train Model Could not read definition from db!Err:")
-			l.Error(err)
-			ModelUpdateStatus(c, model.Id, int(FAILED_TRAINING))
-			return
-		}
-		definitions = append(definitions, rowv)
-	}
-
-	if len(definitions) == 0 {
+	if len(defs) == 0 {
 		l.Error("No Definitions defined!")
 		ModelUpdateStatus(c, model.Id, int(FAILED_TRAINING))
 		return
@@ -761,32 +730,29 @@ func trainModel(c BasePack, model *BaseModel) (err error) {
 	for {
 		var toRemove ToRemoveList = []int{}
-		for i, def := range definitions {
-			ModelDefinitionUpdateStatus(c, def.id, MODEL_DEFINITION_STATUS_TRAINING)
-			accuracy, err := trainDefinition(c, model, def.id, !firstRound)
+		for i, def := range defs {
+			ModelDefinitionUpdateStatus(c, def.Id, MODEL_DEFINITION_STATUS_TRAINING)
+			accuracy, err := trainDefinition(c, model, *def, !firstRound)
 			if err != nil {
 				l.Error("Failed to train definition!Err:", "err", err)
-				ModelDefinitionUpdateStatus(c, def.id, MODEL_DEFINITION_STATUS_FAILED_TRAINING)
+				ModelDefinitionUpdateStatus(c, def.Id, MODEL_DEFINITION_STATUS_FAILED_TRAINING)
 				toRemove = append(toRemove, i)
 				continue
 			}
 
-			def.epoch += EPOCH_PER_RUN
+			def.Epoch += EPOCH_PER_RUN
 			accuracy = accuracy * 100
-			def.acuracy = float64(accuracy)
+			def.Accuracy = float64(accuracy)
 
-			definitions[i].epoch += EPOCH_PER_RUN
-			definitions[i].acuracy = accuracy
-
-			if accuracy >= float64(def.target_accuracy) {
+			if accuracy >= float64(def.TargetAccuracy) {
 				l.Info("Found a definition that reaches target_accuracy!")
-				_, err = db.Exec("update model_definition set accuracy=$1, status=$2, epoch=$3 where id=$4", accuracy, MODEL_DEFINITION_STATUS_TRANIED, def.epoch, def.id)
+				_, err = db.Exec("update model_definition set accuracy=$1, status=$2, epoch=$3 where id=$4", accuracy, MODEL_DEFINITION_STATUS_TRANIED, def.Epoch, def.Id)
 				if err != nil {
 					l.Error("Failed to train definition!Err:\n", "err", err)
 					ModelUpdateStatus(c, model.Id, int(FAILED_TRAINING))
 					return err
 				}
 
-				_, err = db.Exec("update model_definition set status=$1 where id!=$2 and model_id=$3 and status!=$4", MODEL_DEFINITION_STATUS_CANCELD_TRAINING, def.id, model.Id, MODEL_DEFINITION_STATUS_FAILED_TRAINING)
+				_, err = db.Exec("update model_definition set status=$1 where id!=$2 and model_id=$3 and status!=$4", MODEL_DEFINITION_STATUS_CANCELD_TRAINING, def.Id, model.Id, MODEL_DEFINITION_STATUS_FAILED_TRAINING)
 				if err != nil {
 					l.Error("Failed to train definition!Err:\n", "err", err)
 					ModelUpdateStatus(c, model.Id, int(FAILED_TRAINING))
@@ -797,14 +763,14 @@ func trainModel(c BasePack, model *BaseModel) (err error) {
 				break
 			}
 
-			if def.epoch > MAX_EPOCH {
-				fmt.Printf("Failed to train definition! Accuracy less %f < %d\n", accuracy, def.target_accuracy)
-				ModelDefinitionUpdateStatus(c, def.id, MODEL_DEFINITION_STATUS_FAILED_TRAINING)
+			if def.Epoch > MAX_EPOCH {
+				fmt.Printf("Failed to train definition! Accuracy less %f < %d\n", accuracy, def.TargetAccuracy)
+				ModelDefinitionUpdateStatus(c, def.Id, MODEL_DEFINITION_STATUS_FAILED_TRAINING)
 				toRemove = append(toRemove, i)
 				continue
 			}
 
-			_, err = db.Exec("update model_definition set accuracy=$1, epoch=$2, status=$3 where id=$4", accuracy, def.epoch, MODEL_DEFINITION_STATUS_PAUSED_TRAINING, def.id)
+			_, err = db.Exec("update model_definition set accuracy=$1, epoch=$2, status=$3 where id=$4", accuracy, def.Epoch, MODEL_DEFINITION_STATUS_PAUSED_TRAINING, def.Id)
 			if err != nil {
 				l.Error("Failed to train definition!Err:\n", "err", err)
 				ModelUpdateStatus(c, model.Id, int(FAILED_TRAINING))
@@ -822,28 +788,26 @@ func trainModel(c BasePack, model *BaseModel) (err error) {
 		l.Info("Round done", "toRemove", toRemove)
 
 		for _, n := range toRemove {
-			definitions = remove(definitions, n)
+			defs = remove(defs, n)
 		}
 
-		len_def := len(definitions)
+		len_def := len(defs)
 
 		if len_def == 0 {
 			break
-		}
-
-		if len_def == 1 {
+		} else if len_def == 1 {
 			continue
 		}
 
-		sort.Sort(sort.Reverse(definitions))
+		sort.Sort(sort.Reverse(defs))
 
-		acc := definitions[0].acuracy - 20.0
+		acc := defs[0].Accuracy - 20.0
 
-		l.Info("Training models, Highest acc", "acc", definitions[0].acuracy, "mod_acc", acc)
+		l.Info("Training models, Highest acc", "acc", defs[0].Accuracy, "mod_acc", acc)
 
 		toRemove = []int{}
-		for i, def := range definitions {
-			if def.acuracy < acc {
+		for i, def := range defs {
+			if def.Accuracy < acc {
 				toRemove = append(toRemove, i)
 			}
 		}
@@ -853,8 +817,8 @@ func trainModel(c BasePack, model *BaseModel) (err error) {
 		sort.Sort(sort.Reverse(toRemove))
 		for _, n := range toRemove {
 			l.Warn("Removing definition not fast enough learning", "n", n)
-			ModelDefinitionUpdateStatus(c, definitions[n].id, MODEL_DEFINITION_STATUS_FAILED_TRAINING)
-			definitions = remove(definitions, n)
+			ModelDefinitionUpdateStatus(c, defs[n].Id, MODEL_DEFINITION_STATUS_FAILED_TRAINING)
+			defs = remove(defs, n)
 		}
 	}
 
diff --git a/logic/tasks/runner/runner.go b/logic/tasks/runner/runner.go
index 38c3cdc..ca660ab 100644
--- a/logic/tasks/runner/runner.go
+++ b/logic/tasks/runner/runner.go
@@ -259,21 +259,8 @@ func RunnerOrchestrator(db db.Db, config Config, handler *Handle) {
 			}
 		}
 
-		if task_to_dispatch != nil {
-
-			// Only let CPU tasks be done by the local users
-			if task_to_dispatch.TaskType == int(TASK_TYPE_DELETE_USER) {
-				for i := 0; i < len(task_runners_used); i += 1 {
-					if !task_runners_used[i] {
-						task_runners[i] <- *task_to_dispatch
-						task_runners_used[i] = true
-						task_to_dispatch = nil
-						break
-					}
-				}
-				continue
-			}
-
+		if task_to_dispatch != nil && task_to_dispatch.TaskType != int(TASK_TYPE_DELETE_USER) {
+			// TODO split tasks into cpu tasks and GPU tasks
 			mutex := handler.DataMap["runners_mutex"].(*sync.Mutex)
 			mutex.Lock()
 			remote_runners := handler.DataMap["runners"].(map[string]interface{})
@@ -286,14 +273,27 @@ func RunnerOrchestrator(db db.Db, config Config, handler *Handle) {
 				continue
 			}
 
-			if runner_info.UserId == task_to_dispatch.UserId {
-				go handleRemoteTask(handler, base, k, *task_to_dispatch)
+			if runner_info.UserId != task_to_dispatch.UserId {
+				continue
+			}
+
+			go handleRemoteTask(handler, base, k, *task_to_dispatch)
+			task_to_dispatch = nil
+			break
+		}
+
+		mutex.Unlock()
+	}
+
+	if task_to_dispatch != nil {
+		for i := 0; i < len(task_runners_used); i += 1 {
+			if !task_runners_used[i] {
+				task_runners[i] <- *task_to_dispatch
+				task_runners_used[i] = true
 				task_to_dispatch = nil
 				break
 			}
 		}
-
-		mutex.Unlock()
 	}
 }
 }