Fixed the model not training and running forever
This commit is contained in:
parent 2fa7680d0b
commit 29b69deaf6
@@ -8,6 +8,7 @@ import (
	. "git.andr3h3nriqu3s.com/andr3/fyp/logic/db_types"
	. "git.andr3h3nriqu3s.com/andr3/fyp/logic/tasks/utils"

	"github.com/charmbracelet/log"
	tf "github.com/galeone/tensorflow/tensorflow/go"
	"github.com/galeone/tensorflow/tensorflow/go/op"
	tg "github.com/galeone/tfgo"
@@ -19,6 +20,7 @@ func ReadPNG(scope *op.Scope, imagePath string, channels int64) *image.Image {
	contents := op.ReadFile(scope.SubScope("ReadFile"), op.Const(scope.SubScope("filename"), imagePath))
	output := op.DecodePng(scope.SubScope("DecodePng"), contents, op.DecodePngChannels(channels))
	output = op.ExpandDims(scope.SubScope("ExpandDims"), output, op.Const(scope.SubScope("axis"), []int32{0}))
	output = op.ExpandDims(scope.SubScope("Stack"), output, op.Const(scope.SubScope("axis"), []int32{1}))
	image := &image.Image{
		Tensor: tg.NewTensor(scope, output)}
	return image.Scale(0, 255)
@@ -29,6 +31,7 @@ func ReadJPG(scope *op.Scope, imagePath string, channels int64) *image.Image {
	contents := op.ReadFile(scope.SubScope("ReadFile"), op.Const(scope.SubScope("filename"), imagePath))
	output := op.DecodePng(scope.SubScope("DecodeJpeg"), contents, op.DecodePngChannels(channels))
	output = op.ExpandDims(scope.SubScope("ExpandDims"), output, op.Const(scope.SubScope("axis"), []int32{0}))
	output = op.ExpandDims(scope.SubScope("Stack"), output, op.Const(scope.SubScope("axis"), []int32{1}))
	image := &image.Image{
		Tensor: tg.NewTensor(scope, output)}
	return image.Scale(0, 255)
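As a side note, the two ExpandDims calls in ReadPNG/ReadJPG give the decoded image two extra leading axes, so an H x W x C image becomes a [1, 1, H, W, C] tensor. A self-contained sketch (an assumed illustration, not part of the commit) that prints the resulting shape using the same ops:

package main

import (
	"fmt"

	tf "github.com/galeone/tensorflow/tensorflow/go"
	"github.com/galeone/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	// A dummy 2x2 single-channel "image" standing in for a decoded PNG/JPEG.
	img := op.Const(s.SubScope("img"), [][][]float32{{{1}, {2}}, {{3}, {4}}})
	// Same pattern as the hunk above: expand axis 0, then axis 1.
	out := op.ExpandDims(s.SubScope("ExpandDims"), img, op.Const(s.SubScope("axis0"), []int32{0}))
	out = op.ExpandDims(s.SubScope("Stack"), out, op.Const(s.SubScope("axis1"), []int32{1}))

	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()

	res, err := sess.Run(nil, []tf.Output{out}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(res[0].Shape()) // [1 1 2 2 1]
}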
@@ -49,6 +52,8 @@ func runModelNormal(base BasePack, model *BaseModel, def_id string, inputImage *
	var vmax float32 = 0.0
	var predictions = results[0].Value().([][]float32)[0]

	log.Info("preds", "preds", predictions)

	for i, v := range predictions {
		if v > vmax {
			order = i
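For reference, the selection runModelNormal performs over the logged predictions reduces to an argmax with a confidence value; a minimal standalone sketch of that loop (the helper name is assumed, not part of the commit):

// argmaxWithConfidence mirrors the loop above: it returns the index of the
// largest prediction and that prediction's value.
func argmaxWithConfidence(predictions []float32) (order int, confidence float32) {
	var vmax float32 = 0.0
	for i, v := range predictions {
		if v > vmax {
			order = i
			vmax = v
		}
	}
	return order, vmax
}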
@@ -62,10 +67,13 @@ func runModelNormal(base BasePack, model *BaseModel, def_id string, inputImage *
}

func runModelExp(base BasePack, model *BaseModel, def_id string, inputImage *tf.Tensor) (order int, confidence float32, err error) {
	log := base.GetLogger()

	err = nil
	order = 0

	log.Info("Running base")

	base_model := tg.LoadModel(path.Join("savedData", model.Id, "defs", def_id, "base", "model"), []string{"serve"}, nil)

	//results := base_model.Exec([]tf.Output{
@@ -86,7 +94,7 @@ func runModelExp(base BasePack, model *BaseModel, def_id string, inputImage *tf.
		return
	}

	base.GetLogger().Info("test", "count", len(heads))
	log.Info("Running heads", "heads", heads)

	var vmax float32 = 0.0

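A hedged sketch of how a SavedModel loaded with tg.LoadModel, as in the hunk above, is typically executed with tfgo; the operation names "serving_default_input_1" and "StatefulPartitionedCall" and the helper runBase are assumptions for a Keras export, not taken from the commit:

package sketch

import (
	"path"

	tf "github.com/galeone/tensorflow/tensorflow/go"
	tg "github.com/galeone/tfgo"
)

// runBase loads the exported base model and runs one inference pass,
// returning the first row of the prediction matrix.
func runBase(modelId, defId string, inputImage *tf.Tensor) []float32 {
	m := tg.LoadModel(path.Join("savedData", modelId, "defs", defId, "base", "model"), []string{"serve"}, nil)
	results := m.Exec(
		[]tf.Output{m.Op("StatefulPartitionedCall", 0)}, // assumed output op of the serving signature
		map[tf.Output]*tf.Tensor{
			m.Op("serving_default_input_1", 0): inputImage, // assumed input op name
		},
	)
	return results[0].Value().([][]float32)[0]
}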
@@ -1185,7 +1185,14 @@ func splitModel(c BasePack, model *BaseModel) (err error) {
	count := -1

	for layers.Next() {
		var layerrow layerrow
		if err = layers.Scan(&layerrow.ExpType); err != nil {
			return
		}
		count += 1
		if layerrow.ExpType == 2 {
			break
		}
	}

	if count == -1 {
@@ -1294,63 +1301,32 @@ func generateDefinition(c BasePack, model *BaseModel, target_accuracy int, numbe
		order++
	}

	if complexity == 0 {
		err = MakeLayer(db, def_id, order, LAYER_FLATTEN, "")
		loop := max(int((math.Log(float64(model.Width)) / math.Log(float64(10)))), 1)
		for i := 0; i < loop; i++ {
			err = MakeLayer(db, def_id, order, LAYER_SIMPLE_BLOCK, "")
			order++
			if err != nil {
				failed()
				return
			}
			order++
		}

		loop := int(math.Log2(float64(number_of_classes)))
		for i := 0; i < loop; i++ {
			err = MakeLayer(db, def_id, order, LAYER_DENSE, fmt.Sprintf("%d,1", number_of_classes*(loop-i)))
			order++
			if err != nil {
				ModelUpdateStatus(c, model.Id, FAILED_PREPARING_TRAINING)
				return
			}
		}

	} else if complexity == 1 || complexity == 2 {

		loop := int((math.Log(float64(model.Width)) / math.Log(float64(10))))
		if loop == 0 {
			loop = 1
		}
		for i := 0; i < loop; i++ {
			err = MakeLayer(db, def_id, order, LAYER_SIMPLE_BLOCK, "")
			order++
			if err != nil {
				failed()
				return
			}
		}

		err = MakeLayer(db, def_id, order, LAYER_FLATTEN, "")
		if err != nil {
			failed()
			return
		}
		order++

		loop = int((math.Log(float64(number_of_classes)) / math.Log(float64(10))) / 2)
		if loop == 0 {
			loop = 1
		}
		for i := 0; i < loop; i++ {
			err = MakeLayer(db, def_id, order, LAYER_DENSE, fmt.Sprintf("%d,1", number_of_classes*(loop-i)))
			order++
			if err != nil {
				failed()
				return
			}
		}
	} else {
		log.Error("Unkown complexity", "complexity", complexity)
		err = MakeLayer(db, def_id, order, LAYER_FLATTEN, "")
		if err != nil {
			failed()
			return
		}
		order++

		loop = max(int((math.Log(float64(number_of_classes))/math.Log(float64(10)))/2), 1)
		for i := 0; i < loop; i++ {
			err = MakeLayer(db, def_id, order, LAYER_DENSE, fmt.Sprintf("%d,1", number_of_classes*(loop-i)))
			order++
			if err != nil {
				failed()
				return
			}
		}

	err = ModelDefinitionUpdateStatus(c, def_id, MODEL_DEFINITION_STATUS_INIT)
	if err != nil {
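The layer counts in this hunk are driven by the base-10 logarithms of the model width and of the class count. A small standalone illustration with assumed example values (it uses the Go 1.21 max builtin, as the hunk itself does):

package main

import (
	"fmt"
	"math"
)

func main() {
	width, classes := 100.0, 26.0 // assumed example: 100-pixel-wide images, 26 classes
	convBlocks := max(int(math.Log(width)/math.Log(10)), 1)        // one LAYER_SIMPLE_BLOCK per power of ten of width
	denseLayers := max(int((math.Log(classes)/math.Log(10))/2), 1) // dense layers scale with half of log10(classes)
	fmt.Println(convBlocks, denseLayers)                           // prints: 2 1
}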
@@ -1486,10 +1462,10 @@ func generateExpandableDefinition(c BasePack, model *BaseModel, target_accuracy

	log.Info("Size of the dense layers", "loop", loop)

	// loop = max(loop, 3)
	loop = max(loop, 3)

	for i := 0; i < loop; i++ {
		err = MakeLayer(db, def_id, order, LAYER_DENSE, fmt.Sprintf("%d,1", number_of_classes*(loop-i)))
		err = MakeLayerExpandable(db, def_id, order, LAYER_DENSE, fmt.Sprintf("%d,1", number_of_classes*(loop-i)*2), 2)
		order++
		if err != nil {
			failed()
@@ -1712,10 +1688,22 @@ func RunTaskRetrain(b BasePack, task Task) (err error) {
		return
	}

	failed = func() {
		ResetClasses(b, model)
		ModelUpdateStatus(b, model.Id, READY_RETRAIN_FAILED)
		task.UpdateStatusLog(b, TASK_FAILED_RUNNING, "Model failed retraining")
		_, err_ := db.Exec("delete from exp_model_head where def_id=$1 and status in (2,3)", defData.Id)
		if err_ != nil {
			panic(err_)
		}
		l.Error("Failed to retrain", "err", err)
	}

	var acc float64 = 0
	var epocs = 0
	// TODO make max epochs come from db
	for acc*100 < defData.TargetAcuuracy && epocs < 20 {
	// TODO re increase the target accuracy
	for acc*100 < defData.TargetAcuuracy-5 && epocs < 10 {
		// This is something I have to check
		acc, err = trainDefinitionExpandExp(b, model, defData.Id, epocs > 0)
		if err != nil {
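The epoch bound added to this loop is what stops retraining from running forever when accuracy never reaches the target. A minimal sketch of that loop shape, with trainOnce standing in for trainDefinitionExpandExp (names assumed, not the commit's code):

// retrainBounded keeps training until the target accuracy is reached or the
// epoch budget runs out, so a stalled model can no longer loop indefinitely.
func retrainBounded(trainOnce func() (float64, error), targetAccuracy float64, maxEpochs int) (acc float64, err error) {
	for epochs := 0; acc*100 < targetAccuracy && epochs < maxEpochs; epochs++ {
		if acc, err = trainOnce(); err != nil {
			return
		}
	}
	return
}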
@@ -9,9 +9,9 @@ import requests
class NotifyServerCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, log, *args, **kwargs):
{{ if .HeadId }}
        requests.get(f'{{ .Host }}/api/model/head/epoch/update?epoch={epoch + 1}&accuracy={log["accuracy"]}&head_id={{.HeadId}}')
        requests.get(f'{{ .Host }}/api/model/head/epoch/update?epoch={epoch + 1}&accuracy={log["val_accuracy"]}&head_id={{.HeadId}}')
{{ else }}
        requests.get(f'{{ .Host }}/api/model/epoch/update?model_id={{.Model.Id}}&epoch={epoch + 1}&accuracy={log["accuracy"]}&definition={{.DefId}}')
        requests.get(f'{{ .Host }}/api/model/epoch/update?model_id={{.Model.Id}}&epoch={epoch + 1}&accuracy={log["val_accuracy"]}&definition={{.DefId}}')
{{end}}

@@ -10,7 +10,7 @@ import numpy as np

class NotifyServerCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, log, *args, **kwargs):
        requests.get(f'{{ .Host }}/api/model/head/epoch/update?epoch={epoch + 1}&accuracy={log["accuracy"]}&head_id={{.HeadId}}')
        requests.get(f'{{ .Host }}/api/model/head/epoch/update?epoch={epoch + 1}&accuracy={log["val_accuracy"]}&head_id={{.HeadId}}')


DATA_DIR = "{{ .DataDir }}"