closes #49 and possibly completes #46

This commit is contained in:
2023-10-22 23:02:39 +01:00
parent 90bc3f6acf
commit c844aeabe4
9 changed files with 256 additions and 135 deletions

View File

@@ -10,6 +10,7 @@ import (
"os"
"os/exec"
"path"
"sort"
"strconv"
"text/template"
@@ -43,6 +44,7 @@ const (
MODEL_DEFINITION_STATUS_PRE_INIT ModelDefinitionStatus = 1
MODEL_DEFINITION_STATUS_INIT = 2
MODEL_DEFINITION_STATUS_TRAINING = 3
MODEL_DEFINITION_STATUS_PAUSED_TRAINING = 6
MODEL_DEFINITION_STATUS_TRANIED = 4
MODEL_DEFINITION_STATUS_READY = 5
)
@@ -50,10 +52,10 @@ const (
// LayerType enumerates the layer kinds that can be stored for a model
// definition; the values below are passed to MakeLayer when a definition's
// layers are generated.
type LayerType int
// NOTE(review): only LAYER_INPUT is declared with the LayerType type. Every
// other constant has its own "= N" expression and no type, so per the Go spec
// it is an untyped integer constant rather than a LayerType.
const (
LAYER_INPUT LayerType = 1
LAYER_DENSE = 2
LAYER_FLATTEN = 3
LAYER_SIMPLE_BLOCK = 4
LAYER_INPUT LayerType = 1
LAYER_DENSE = 2
LAYER_FLATTEN = 3
LAYER_SIMPLE_BLOCK = 4
)
func ModelDefinitionUpdateStatus(c *Context, id string, status ModelDefinitionStatus) (err error) {
@@ -142,6 +144,7 @@ func trainDefinition(c *Context, model *BaseModel, definition_id string, load_pr
if err != nil {
return
}
defer os.RemoveAll(run_path)
_, err = generateCvs(c, run_path, model.Id)
if err != nil {
@@ -174,29 +177,24 @@ func trainDefinition(c *Context, model *BaseModel, definition_id string, load_pr
"DefId": definition_id,
"LoadPrev": load_prev,
"LastModelRunPath": path.Join(getDir(), result_path, "model.keras"),
"SaveModelPath": path.Join(getDir(), result_path),
}); err != nil {
return
}
// Run the command
out, err := exec.Command("bash", "-c", fmt.Sprintf("cd %s && python run.py", run_path)).Output()
out, err := exec.Command("bash", "-c", fmt.Sprintf("cd %s && python run.py", run_path)).CombinedOutput()
if err != nil {
c.Logger.Debug(string(out))
return
}
c.Logger.Info("Python finished running")
if err = os.MkdirAll(result_path, os.ModePerm); err != nil {
return
}
if err = exec.Command("cp", "-r", path.Join(run_path, "model"), path.Join(result_path, "model")).Run(); err != nil {
return
}
if err = exec.Command("cp", "-r", path.Join(run_path, "model.keras"), path.Join(result_path, "model.keras")).Run(); err != nil {
return
}
accuracy_file, err := os.Open(path.Join(run_path, "accuracy.val"))
if err != nil {
return
@@ -214,8 +212,6 @@ func trainDefinition(c *Context, model *BaseModel, definition_id string, load_pr
}
c.Logger.Info("Model finished training!", "accuracy", accuracy)
os.RemoveAll(run_path)
return
}
@@ -236,6 +232,29 @@ func remove[T interface{}](lst []T, i int) []T {
return append(lst[:i], lst[i+1:]...)
}
// TrainModelRow is the in-memory view of one model_definition row while a
// model is being trained.
type TrainModelRow struct {
	// id is the model_definition primary key.
	id string
	// target_accuracy is the accuracy the definition has to reach; trainModel
	// compares it against the measured accuracy scaled by 100.
	target_accuracy int
	// epoch is the epoch counter read from the database.
	epoch int
	// acuracy is the accuracy measured by the latest training run
	// (trainModel stores the value after multiplying it by 100).
	acuracy float64
}

// TraingModelRowDefinitions is a list of TrainModelRow that implements
// sort.Interface, ordering the rows by ascending acuracy.
//
// NOTE(review): trainModel calls sort.Sort on this type and then reads index 0
// as the "highest" accuracy; with this ascending order index 0 is the lowest.
// Confirm which side is intended before relying on the ordering.
type TraingModelRowDefinitions []TrainModelRow

// Len reports how many rows the list holds.
func (d TraingModelRowDefinitions) Len() int {
	return len(d)
}

// Swap exchanges the rows at positions i and j.
func (d TraingModelRowDefinitions) Swap(i, j int) {
	tmp := d[i]
	d[i] = d[j]
	d[j] = tmp
}

// Less reports whether row i has a strictly lower acuracy than row j.
func (d TraingModelRowDefinitions) Less(i, j int) bool {
	left, right := d[i].acuracy, d[j].acuracy
	return right > left
}
// ToRemoveList is a list of ints implementing sort.Interface in ascending
// order. trainModel uses it to collect the slice indexes of definitions that
// should be dropped.
//
// Note that sort.Reverse(list) only returns a wrapper with the comparison
// inverted; nothing is reordered until that wrapper is handed to sort.Sort.
type ToRemoveList []int

// Len reports how many entries the list holds.
func (l ToRemoveList) Len() int {
	return len(l)
}

// Swap exchanges the entries at positions i and j.
func (l ToRemoveList) Swap(i, j int) {
	tmp := l[i]
	l[i] = l[j]
	l[j] = tmp
}

// Less reports whether entry i is strictly smaller than entry j.
func (l ToRemoveList) Less(i, j int) bool {
	return l[j] > l[i]
}
func trainModel(c *Context, model *BaseModel) {
definitionsRows, err := c.Db.Query("select id, target_accuracy, epoch from model_definition where status=$1 and model_id=$2", MODEL_DEFINITION_STATUS_INIT, model.Id)
if err != nil {
@@ -246,16 +265,11 @@ func trainModel(c *Context, model *BaseModel) {
}
defer definitionsRows.Close()
type row struct {
id string
target_accuracy int
epoch int
}
definitions := []row{}
var definitions TraingModelRowDefinitions = []TrainModelRow{}
for definitionsRows.Next() {
var rowv row
var rowv TrainModelRow
rowv.acuracy = 0
if err = definitionsRows.Scan(&rowv.id, &rowv.target_accuracy, &rowv.epoch); err != nil {
c.Logger.Error("Failed to train Model Could not read definition from db!Err:")
c.Logger.Error(err)
@@ -271,23 +285,23 @@ func trainModel(c *Context, model *BaseModel) {
return
}
toTrain := len(definitions)
firstRound := true
var newDefinitions = []row{}
copy(newDefinitions, definitions)
finished := false
for {
var toRemove ToRemoveList = []int{}
for i, def := range definitions {
ModelDefinitionUpdateStatus(c, def.id, MODEL_DEFINITION_STATUS_TRAINING)
accuracy, err := trainDefinition(c, model, def.id, !firstRound)
if err != nil {
c.Logger.Error("Failed to train definition!Err:", "err", err)
ModelDefinitionUpdateStatus(c, def.id, MODEL_DEFINITION_STATUS_FAILED_TRAINING)
toTrain = toTrain - 1
newDefinitions = remove(newDefinitions, i)
toRemove = append(toRemove, i)
continue
}
def.epoch += EPOCH_PER_RUN
accuracy = accuracy * 100
def.acuracy = accuracy
if accuracy >= float64(def.target_accuracy) {
c.Logger.Info("Found a definition that reaches target_accuracy!")
@@ -305,30 +319,68 @@ func trainModel(c *Context, model *BaseModel) {
return
}
toTrain = 0
finished = true
break
}
if def.epoch > MAX_EPOCH {
fmt.Printf("Failed to train definition! Accuracy less %f < %d\n", accuracy, def.target_accuracy)
ModelDefinitionUpdateStatus(c, def.id, MODEL_DEFINITION_STATUS_FAILED_TRAINING)
toTrain = toTrain - 1
newDefinitions = remove(newDefinitions, i)
toRemove = append(toRemove, i)
continue
}
_, err = c.Db.Exec("update model_definition set accuracy=$1, epoch=$2 where id=$3", accuracy, def.epoch, def.id)
if err != nil {
c.Logger.Error("Failed to train definition!Err:\n", "err", err)
ModelUpdateStatus(c, model.Id, FAILED_TRAINING)
return
}
_, err = c.Db.Exec("update model_definition set accuracy=$1, epoch=$2, status=$3 where id=$4", accuracy, def.epoch, MODEL_DEFINITION_STATUS_PAUSED_TRAINING, def.id)
if err != nil {
c.Logger.Error("Failed to train definition!Err:\n", "err", err)
ModelUpdateStatus(c, model.Id, FAILED_TRAINING)
return
}
}
copy(definitions, newDefinitions)
firstRound = false
if toTrain == 0 {
if finished {
break
}
sort.Reverse(toRemove)
c.Logger.Info("Round done", "toRemove", toRemove)
for _, n := range toRemove {
definitions = remove(definitions, n)
}
len_def := len(definitions)
if len_def == 0 {
break
}
if len_def == 1 {
continue
}
sort.Sort(definitions)
acc := definitions[0].acuracy - 20
c.Logger.Info("Training models, Highest acc", "acc", acc)
toRemove = []int{}
for i, def := range definitions {
if def.acuracy < acc {
toRemove = append(toRemove, i)
}
}
c.Logger.Info("Removing due to accuracy", "toRemove", toRemove)
sort.Reverse(toRemove)
for _, n := range toRemove {
c.Logger.Warn("Removing definition not fast enough learning", "n", n)
definitions = remove(definitions, n)
}
}
rows, err := c.Db.Query("select id from model_definition where model_id=$1 and status=$2 order by accuracy desc limit 1;", model.Id, MODEL_DEFINITION_STATUS_TRANIED)
@@ -437,14 +489,26 @@ func generateDefinition(c *Context, model *BaseModel, target_accuracy int, numbe
return failed()
}
order := 1;
order := 1
// Note the shape for now is no used
err = MakeLayer(c.Db, def_id, order, LAYER_INPUT, fmt.Sprintf("%d,%d,1", model.Width, model.Height))
if err != nil {
return failed()
// Note the shape of the first layer defines the import size
if complexity == 2 {
// Note the shape for now is no used
width := int(math.Pow(2, math.Floor(math.Log(float64(model.Width))/math.Log(2.0))))
height := int(math.Pow(2, math.Floor(math.Log(float64(model.Height))/math.Log(2.0))))
c.Logger.Warn("Complexity 2 creating model with smaller size", "width", width, "height", height)
err = MakeLayer(c.Db, def_id, order, LAYER_INPUT, fmt.Sprintf("%d,%d,1", width, height))
if err != nil {
return failed()
}
order++
} else {
err = MakeLayer(c.Db, def_id, order, LAYER_INPUT, fmt.Sprintf("%d,%d,1", model.Width, model.Height))
if err != nil {
return failed()
}
order++
}
order++;
if complexity == 0 {
@@ -452,12 +516,12 @@ func generateDefinition(c *Context, model *BaseModel, target_accuracy int, numbe
if err != nil {
return failed()
}
order++;
order++
loop := int(math.Log2(float64(number_of_classes)))
for i := 0; i < loop; i++ {
err = MakeLayer(c.Db, def_id, order, LAYER_DENSE, fmt.Sprintf("%d,1", number_of_classes*(loop-i)))
order++;
order++
if err != nil {
ModelUpdateStatus(c, model.Id, FAILED_PREPARING_TRAINING)
// TODO improve this response
@@ -465,17 +529,17 @@ func generateDefinition(c *Context, model *BaseModel, target_accuracy int, numbe
}
}
} else if (complexity == 1) {
} else if complexity == 1 {
loop := int((math.Log(float64(model.Width))/math.Log(float64(10))))
if loop == 0 {
loop = 1;
}
loop := int((math.Log(float64(model.Width)) / math.Log(float64(10))))
if loop == 0 {
loop = 1
}
for i := 0; i < loop; i++ {
err = MakeLayer(c.Db, def_id, order, LAYER_SIMPLE_BLOCK, "")
order++;
order++
if err != nil {
return failed();
return failed()
}
}
@@ -483,17 +547,49 @@ func generateDefinition(c *Context, model *BaseModel, target_accuracy int, numbe
if err != nil {
return failed()
}
order++;
order++
loop = int((math.Log(float64(number_of_classes))/math.Log(float64(10)))/2)
if loop == 0 {
loop = 1;
}
loop = int((math.Log(float64(number_of_classes)) / math.Log(float64(10))) / 2)
if loop == 0 {
loop = 1
}
for i := 0; i < loop; i++ {
err = MakeLayer(c.Db, def_id, order, LAYER_DENSE, fmt.Sprintf("%d,1", number_of_classes*(loop-i)))
order++;
order++
if err != nil {
return failed();
return failed()
}
}
} else if complexity == 2 {
loop := int((math.Log(float64(model.Width)) / math.Log(float64(10))))
if loop == 0 {
loop = 1
}
for i := 0; i < loop; i++ {
err = MakeLayer(c.Db, def_id, order, LAYER_SIMPLE_BLOCK, "")
order++
if err != nil {
return failed()
}
}
err = MakeLayer(c.Db, def_id, order, LAYER_FLATTEN, "")
if err != nil {
return failed()
}
order++
loop = int((math.Log(float64(number_of_classes)) / math.Log(float64(10))) / 2)
if loop == 0 {
loop = 1
}
for i := 0; i < loop; i++ {
err = MakeLayer(c.Db, def_id, order, LAYER_DENSE, fmt.Sprintf("%d,1", number_of_classes*(loop-i)))
order++
if err != nil {
return failed()
}
}
@@ -523,19 +619,26 @@ func generateDefinitions(c *Context, model *BaseModel, target_accuracy int, numb
return c.Error500(err)
}
if (number_of_models == 1) {
if (model.Width < 100 && model.Height < 100 && len(cls) < 30) {
generateDefinition(c, model, target_accuracy, len(cls), 0)
} else {
generateDefinition(c, model, target_accuracy, len(cls), 1)
}
} else {
// TODO: handle increasing the complexity
for i := 0; i < number_of_models; i++ {
generateDefinition(c, model, target_accuracy, len(cls), 0)
}
}
cls_len := len(cls)
if number_of_models == 1 {
if model.Width < 100 && model.Height < 100 && cls_len < 30 {
generateDefinition(c, model, target_accuracy, cls_len, 0)
} else if model.Width > 100 && model.Height > 100 {
generateDefinition(c, model, target_accuracy, cls_len, 2)
} else {
generateDefinition(c, model, target_accuracy, cls_len, 1)
}
} else if number_of_models == 3 {
for i := 0; i < number_of_models; i++ {
generateDefinition(c, model, target_accuracy, cls_len, i)
}
} else {
// TODO: handle increasing the complexity
for i := 0; i < number_of_models; i++ {
generateDefinition(c, model, target_accuracy, cls_len, 0)
}
}
return nil
}
@@ -624,14 +727,14 @@ func handleTrain(handle *Handle) {
f := r.URL.Query()
accuracy := 0.0
accuracy := 0.0
if !CheckId(f, "model_id") || !CheckId(f, "definition") || CheckEmpty(f, "epoch") || !CheckFloat64(f, "accuracy", &accuracy){
if !CheckId(f, "model_id") || !CheckId(f, "definition") || CheckEmpty(f, "epoch") || !CheckFloat64(f, "accuracy", &accuracy) {
c.Logger.Warn("Invalid: model_id or definition or epoch or accuracy")
return c.UnsafeErrorCode(nil, 400, nil)
}
accuracy = accuracy * 100
accuracy = accuracy * 100
model_id := f.Get("model_id")
def_id := f.Get("definition")
@@ -665,7 +768,7 @@ func handleTrain(handle *Handle) {
return c.UnsafeErrorCode(nil, 400, nil)
}
c.Logger.Info("Updated model_definition!", "model", model_id, "progress", epoch, "accuracy", accuracy)
c.Logger.Info("Updated model_definition!", "model", model_id, "progress", epoch, "accuracy", accuracy)
_, err = c.Db.Exec("update model_definition set epoch_progress=$1, accuracy=$2 where id=$3", epoch, accuracy, def_id)
if err != nil {