Improved classification performance
This commit is contained in:
parent
516d1d7634
commit
652542d261
@ -31,7 +31,10 @@ ADD go.mod .
|
|||||||
ADD go.sum .
|
ADD go.sum .
|
||||||
ADD main.go .
|
ADD main.go .
|
||||||
ADD logic logic
|
ADD logic logic
|
||||||
|
ADD entrypoint.sh .
|
||||||
|
|
||||||
RUN go install || true
|
RUN go install || true
|
||||||
|
|
||||||
CMD ["go", "run", "."]
|
RUN go build .
|
||||||
|
|
||||||
|
CMD ["./entrypoint.sh"]
|
||||||
|
@ -12,7 +12,7 @@ USER = "service"
|
|||||||
|
|
||||||
[Worker]
|
[Worker]
|
||||||
PULLING_TIME = "500ms"
|
PULLING_TIME = "500ms"
|
||||||
NUMBER_OF_WORKERS = 1
|
NUMBER_OF_WORKERS = 16
|
||||||
|
|
||||||
[DB]
|
[DB]
|
||||||
MAX_CONNECTIONS = 600
|
MAX_CONNECTIONS = 600
|
||||||
|
@ -23,6 +23,13 @@ services:
|
|||||||
- db
|
- db
|
||||||
volumes:
|
volumes:
|
||||||
- "./config.toml:/app/config.toml"
|
- "./config.toml:/app/config.toml"
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
devices:
|
||||||
|
- driver: nvidia
|
||||||
|
count: 1
|
||||||
|
capabilities: [gpu]
|
||||||
proxy-server:
|
proxy-server:
|
||||||
image: andre-fyp-proxy
|
image: andre-fyp-proxy
|
||||||
networks:
|
networks:
|
||||||
|
4
entrypoint.sh
Executable file
4
entrypoint.sh
Executable file
@ -0,0 +1,4 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
while true; do # supervise the server: start ./fyp again every time the process exits
|
||||||
|
./fyp
|
||||||
|
done
|
@ -16,7 +16,6 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func loadBaseImage(c *Context, id string) {
|
func loadBaseImage(c *Context, id string) {
|
||||||
// TODO handle more types than png
|
|
||||||
infile, err := os.Open(path.Join("savedData", id, "baseimage.png"))
|
infile, err := os.Open(path.Join("savedData", id, "baseimage.png"))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
c.Logger.Errorf("Failed to read image for model with id %s\n", id)
|
c.Logger.Errorf("Failed to read image for model with id %s\n", id)
|
||||||
|
@ -4,6 +4,7 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
|
"runtime/debug"
|
||||||
|
|
||||||
. "git.andr3h3nriqu3s.com/andr3/fyp/logic/db_types"
|
. "git.andr3h3nriqu3s.com/andr3/fyp/logic/db_types"
|
||||||
. "git.andr3h3nriqu3s.com/andr3/fyp/logic/tasks/utils"
|
. "git.andr3h3nriqu3s.com/andr3/fyp/logic/tasks/utils"
|
||||||
@ -37,11 +38,19 @@ func ReadJPG(scope *op.Scope, imagePath string, channels int64) *image.Image {
|
|||||||
return image.Scale(0, 255)
|
return image.Scale(0, 255)
|
||||||
}
|
}
|
||||||
|
|
||||||
func runModelNormal(model *BaseModel, def_id string, inputImage *tf.Tensor) (order int, confidence float32, err error) {
|
func runModelNormal(model *BaseModel, def_id string, inputImage *tf.Tensor, data *RunnerModelData) (order int, confidence float32, err error) {
|
||||||
order = 0
|
order = 0
|
||||||
err = nil
|
err = nil
|
||||||
|
|
||||||
tf_model := tg.LoadModel(path.Join("savedData", model.Id, "defs", def_id, "model"), []string{"serve"}, nil)
|
var tf_model *tg.Model = nil
|
||||||
|
|
||||||
|
if data.Id != nil && *data.Id == def_id {
|
||||||
|
tf_model = data.Model
|
||||||
|
} else {
|
||||||
|
tf_model = tg.LoadModel(path.Join("savedData", model.Id, "defs", def_id, "model"), []string{"serve"}, nil)
|
||||||
|
data.Model = tf_model
|
||||||
|
data.Id = &def_id
|
||||||
|
}
|
||||||
|
|
||||||
results := tf_model.Exec([]tf.Output{
|
results := tf_model.Exec([]tf.Output{
|
||||||
tf_model.Op("StatefulPartitionedCall", 0),
|
tf_model.Op("StatefulPartitionedCall", 0),
|
||||||
@ -125,10 +134,15 @@ func runModelExp(base BasePack, model *BaseModel, def_id string, inputImage *tf.
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
func ClassifyTask(base BasePack, task Task) (err error) {
|
type RunnerModelData struct { // RunnerModelData caches the model most recently loaded by runModelNormal; each runner creates one and passes it to ClassifyTask.
|
||||||
|
Id *string // definition id of the cached model; nil when nothing is cached
|
||||||
|
Model *tg.Model // loaded model that is reused when the next task asks for the same definition id
|
||||||
|
}
|
||||||
|
|
||||||
|
func ClassifyTask(base BasePack, task Task, data *RunnerModelData) (err error) {
|
||||||
defer func() {
|
defer func() {
|
||||||
if r := recover(); r != nil {
|
if r := recover(); r != nil {
|
||||||
base.GetLogger().Error("Task failed due to", "error", r)
|
base.GetLogger().Error("Task failed due to", "error", r, "stack", string(debug.Stack()))
|
||||||
task.UpdateStatusLog(base, TASK_FAILED_RUNNING, "Task failed running")
|
task.UpdateStatusLog(base, TASK_FAILED_RUNNING, "Task failed running")
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
@ -186,6 +200,8 @@ func ClassifyTask(base BasePack, task Task) (err error) {
|
|||||||
|
|
||||||
if model.ModelType == 2 {
|
if model.ModelType == 2 {
|
||||||
base.GetLogger().Info("Running model normal", "model", model.Id, "def", def_id)
|
base.GetLogger().Info("Running model normal", "model", model.Id, "def", def_id)
|
||||||
|
data.Model = nil
|
||||||
|
data.Id = nil
|
||||||
vi, confidence, err = runModelExp(base, model, def_id, inputImage)
|
vi, confidence, err = runModelExp(base, model, def_id, inputImage)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
task.UpdateStatusLog(base, TASK_FAILED_RUNNING, "Failed to run model")
|
task.UpdateStatusLog(base, TASK_FAILED_RUNNING, "Failed to run model")
|
||||||
@ -193,7 +209,7 @@ func ClassifyTask(base BasePack, task Task) (err error) {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
base.GetLogger().Info("Running model normal", "model", model.Id, "def", def_id)
|
base.GetLogger().Info("Running model normal", "model", model.Id, "def", def_id)
|
||||||
vi, confidence, err = runModelNormal(model, def_id, inputImage)
|
vi, confidence, err = runModelNormal(model, def_id, inputImage, data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
task.UpdateStatusLog(base, TASK_FAILED_RUNNING, "Failed to run model")
|
task.UpdateStatusLog(base, TASK_FAILED_RUNNING, "Failed to run model")
|
||||||
return
|
return
|
||||||
|
@ -1191,7 +1191,7 @@ func generateDefinition(c BasePack, model *BaseModel, target_accuracy int, numbe
|
|||||||
}
|
}
|
||||||
order++
|
order++
|
||||||
|
|
||||||
loop := max(1, int((math.Log(float64(model.Width)) / math.Log(float64(10)))))
|
loop := max(1, int(math.Ceil((math.Log(float64(model.Width))/math.Log(float64(10)))))+1)
|
||||||
for i := 0; i < loop; i++ {
|
for i := 0; i < loop; i++ {
|
||||||
_, err = def.MakeLayer(db, order, LAYER_SIMPLE_BLOCK, "")
|
_, err = def.MakeLayer(db, order, LAYER_SIMPLE_BLOCK, "")
|
||||||
order++
|
order++
|
||||||
@ -1299,7 +1299,7 @@ func generateExpandableDefinition(c BasePack, model *BaseModel, target_accuracy
|
|||||||
order++
|
order++
|
||||||
|
|
||||||
// Create the blocks
|
// Create the blocks
|
||||||
loop := int((math.Log(float64(model.Width)) / math.Log(float64(10))))
|
loop := int(math.Ceil((math.Log(float64(model.Width)) / math.Log(float64(10))))) + 1
|
||||||
|
|
||||||
/*if model.Width < 50 && model.Height < 50 {
|
/*if model.Width < 50 && model.Height < 50 {
|
||||||
loop = 0
|
loop = 0
|
||||||
|
@ -68,7 +68,7 @@ func handleTasksStats(handle *Handle) {
|
|||||||
} else if task.Status < 2 {
|
} else if task.Status < 2 {
|
||||||
total.Classfication_pre += 1
|
total.Classfication_pre += 1
|
||||||
hours[hour].Classfication_pre += 1
|
hours[hour].Classfication_pre += 1
|
||||||
} else if task.Status < 4 {
|
} else if task.Status < 4 || task.Status == 5 {
|
||||||
total.Classfication_running += 1
|
total.Classfication_running += 1
|
||||||
hours[hour].Classfication_running += 1
|
hours[hour].Classfication_running += 1
|
||||||
}
|
}
|
||||||
|
@ -19,6 +19,8 @@ import (
|
|||||||
. "git.andr3h3nriqu3s.com/andr3/fyp/logic/utils"
|
. "git.andr3h3nriqu3s.com/andr3/fyp/logic/utils"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var QUEUE_SIZE = 10
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Actually runs the code
|
* Actually runs the code
|
||||||
*/
|
*/
|
||||||
@ -47,17 +49,28 @@ func runner(config Config, db db.Db, task_channel chan Task, index int, back_cha
|
|||||||
Host: config.Hostname,
|
Host: config.Hostname,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
loaded_model := RunnerModelData{
|
||||||
|
Id: nil,
|
||||||
|
Model: nil,
|
||||||
|
}
|
||||||
|
|
||||||
|
count := 0
|
||||||
for task := range task_channel {
|
for task := range task_channel {
|
||||||
logger.Info("Got task", "task", task)
|
logger.Info("Got task", "task", task)
|
||||||
task.UpdateStatusLog(base, TASK_PICKED_UP, "Runner picked up task")
|
task.UpdateStatusLog(base, TASK_PICKED_UP, "Runner picked up task")
|
||||||
|
|
||||||
if task.TaskType == int(TASK_TYPE_CLASSIFICATION) {
|
if task.TaskType == int(TASK_TYPE_CLASSIFICATION) {
|
||||||
logger.Info("Classification Task")
|
logger.Info("Classification Task")
|
||||||
if err = ClassifyTask(base, task); err != nil {
|
if err = ClassifyTask(base, task, &loaded_model); err != nil {
|
||||||
logger.Error("Classification task failed", "error", err)
|
logger.Error("Classification task failed", "error", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
back_channel <- index
|
if count == QUEUE_SIZE {
|
||||||
|
back_channel <- index
|
||||||
|
count = 0
|
||||||
|
} else {
|
||||||
|
count += 1
|
||||||
|
}
|
||||||
continue
|
continue
|
||||||
} else if task.TaskType == int(TASK_TYPE_TRAINING) {
|
} else if task.TaskType == int(TASK_TYPE_TRAINING) {
|
||||||
logger.Info("Training Task")
|
logger.Info("Training Task")
|
||||||
@ -65,7 +78,12 @@ func runner(config Config, db db.Db, task_channel chan Task, index int, back_cha
|
|||||||
logger.Error("Failed to tain the model", "error", err)
|
logger.Error("Failed to tain the model", "error", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
back_channel <- index
|
if count == QUEUE_SIZE {
|
||||||
|
back_channel <- index
|
||||||
|
count = 0
|
||||||
|
} else {
|
||||||
|
count += 1
|
||||||
|
}
|
||||||
continue
|
continue
|
||||||
} else if task.TaskType == int(TASK_TYPE_RETRAINING) {
|
} else if task.TaskType == int(TASK_TYPE_RETRAINING) {
|
||||||
logger.Info("Retraining Task")
|
logger.Info("Retraining Task")
|
||||||
@ -73,7 +91,12 @@ func runner(config Config, db db.Db, task_channel chan Task, index int, back_cha
|
|||||||
logger.Error("Failed to tain the model", "error", err)
|
logger.Error("Failed to tain the model", "error", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
back_channel <- index
|
if count == QUEUE_SIZE {
|
||||||
|
back_channel <- index
|
||||||
|
count = 0
|
||||||
|
} else {
|
||||||
|
count += 1
|
||||||
|
}
|
||||||
continue
|
continue
|
||||||
} else if task.TaskType == int(TASK_TYPE_DELETE_USER) {
|
} else if task.TaskType == int(TASK_TYPE_DELETE_USER) {
|
||||||
logger.Warn("User deleting Task")
|
logger.Warn("User deleting Task")
|
||||||
@ -81,13 +104,23 @@ func runner(config Config, db db.Db, task_channel chan Task, index int, back_cha
|
|||||||
logger.Error("Failed to tain the model", "error", err)
|
logger.Error("Failed to tain the model", "error", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
back_channel <- index
|
if count == QUEUE_SIZE {
|
||||||
|
back_channel <- index
|
||||||
|
count = 0
|
||||||
|
} else {
|
||||||
|
count += 1
|
||||||
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.Error("Do not know how to route task", "task", task)
|
logger.Error("Do not know how to route task", "task", task)
|
||||||
task.UpdateStatusLog(base, TASK_FAILED_RUNNING, "Do not know how to route task")
|
task.UpdateStatusLog(base, TASK_FAILED_RUNNING, "Do not know how to route task")
|
||||||
back_channel <- index
|
if count == QUEUE_SIZE {
|
||||||
|
back_channel <- index
|
||||||
|
count = 0
|
||||||
|
} else {
|
||||||
|
count += 1
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -145,7 +178,7 @@ func handleRemoteTask(handler *Handle, base BasePack, runner_id string, task Tas
|
|||||||
/**
|
/**
|
||||||
* Tells the orchestrator to look at the task list from time to time
|
* Tells the orchestrator to look at the task list from time to time
|
||||||
*/
|
*/
|
||||||
func attentionSeeker(config Config, back_channel chan int) {
|
func attentionSeeker(config Config, db db.Db, back_channel chan int) {
|
||||||
logger := log.NewWithOptions(os.Stdout, log.Options{
|
logger := log.NewWithOptions(os.Stdout, log.Options{
|
||||||
ReportCaller: true,
|
ReportCaller: true,
|
||||||
ReportTimestamp: true,
|
ReportTimestamp: true,
|
||||||
@ -170,6 +203,20 @@ func attentionSeeker(config Config, back_channel chan int) {
|
|||||||
for true {
|
for true {
|
||||||
back_channel <- 0
|
back_channel <- 0
|
||||||
|
|
||||||
|
for { // keep waiting while tasks are still queued (status 5) or running (status 3)
|
||||||
|
var s struct {
|
||||||
|
Count int `json:"count(*)"`
|
||||||
|
}
|
||||||
|
err := GetDBOnce(db, &s, "tasks where status = 5 or status = 3") // column name was misspelled "stauts", which made the lookup fail and end the wait immediately
|
||||||
|
if err != nil {
|
||||||
|
break // best effort: on a database error stop waiting and fall back to the regular sleep
|
||||||
|
}
|
||||||
|
if s.Count == 0 {
|
||||||
|
break // nothing queued or running any more
|
||||||
|
}
|
||||||
|
time.Sleep(t)
|
||||||
|
}
|
||||||
|
|
||||||
time.Sleep(t)
|
time.Sleep(t)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -194,11 +241,16 @@ func RunnerOrchestrator(db db.Db, config Config, handler *Handle) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
gpu_workers := config.GpuWorker.NumberOfWorkers
|
gpu_workers := config.GpuWorker.NumberOfWorkers
|
||||||
|
def_wait, err := time.ParseDuration(config.GpuWorker.Pulling)
|
||||||
|
if err != nil {
|
||||||
|
logger.Error("Failed to load", "error", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
logger.Info("Starting runners")
|
logger.Info("Starting runners")
|
||||||
|
|
||||||
task_runners := make([]chan Task, gpu_workers)
|
task_runners := make([]chan Task, gpu_workers)
|
||||||
task_runners_used := make([]bool, gpu_workers)
|
task_runners_used := make([]int, gpu_workers)
|
||||||
// One more to accommodate the Attention Seeker channel
|
// One more to accommodate the Attention Seeker channel
|
||||||
back_channel := make(chan int, gpu_workers+1)
|
back_channel := make(chan int, gpu_workers+1)
|
||||||
|
|
||||||
@ -213,12 +265,12 @@ func RunnerOrchestrator(db db.Db, config Config, handler *Handle) {
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
go attentionSeeker(config, back_channel)
|
// go attentionSeeker(config, db, back_channel)
|
||||||
|
|
||||||
// Start the runners
|
// Start the runners
|
||||||
for i := 0; i < gpu_workers; i++ {
|
for i := 0; i < gpu_workers; i++ {
|
||||||
task_runners[i] = make(chan Task, 10)
|
task_runners[i] = make(chan Task, QUEUE_SIZE)
|
||||||
task_runners_used[i] = false
|
task_runners_used[i] = 0
|
||||||
AddLocalRunner(handler, LocalRunner{
|
AddLocalRunner(handler, LocalRunner{
|
||||||
RunnerNum: i + 1,
|
RunnerNum: i + 1,
|
||||||
Task: nil,
|
Task: nil,
|
||||||
@ -226,82 +278,107 @@ func RunnerOrchestrator(db db.Db, config Config, handler *Handle) {
|
|||||||
go runner(config, db, task_runners[i], i+1, back_channel)
|
go runner(config, db, task_runners[i], i+1, back_channel)
|
||||||
}
|
}
|
||||||
|
|
||||||
var task_to_dispatch *Task = nil
|
used := 0
|
||||||
|
wait := time.Nanosecond * 100
|
||||||
for i := range back_channel {
|
for {
|
||||||
|
out := true
|
||||||
if i != 0 {
|
for out {
|
||||||
if i > 0 {
|
select {
|
||||||
logger.Info("Runner freed", "runner", i)
|
case i := <-back_channel:
|
||||||
task_runners_used[i-1] = false
|
if i != 0 {
|
||||||
} else if i < 0 {
|
if i > 0 {
|
||||||
logger.Error("Runner died! Restarting!", "runner", i)
|
logger.Info("Runner freed", "runner", i)
|
||||||
i = int(math.Abs(float64(i)) - 1)
|
task_runners_used[i-1] = 0
|
||||||
task_runners_used[i] = false
|
used = 0
|
||||||
go runner(config, db, task_runners[i], i+1, back_channel)
|
} else if i < 0 {
|
||||||
|
logger.Error("Runner died! Restarting!", "runner", i)
|
||||||
|
i = int(math.Abs(float64(i)) - 1)
|
||||||
|
task_runners_used[i] = 0
|
||||||
|
used = 0
|
||||||
|
go runner(config, db, task_runners[i], i+1, back_channel)
|
||||||
|
}
|
||||||
|
AddLocalTask(handler, int(math.Abs(float64(i))), nil)
|
||||||
|
} else if used == len(task_runners_used) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
case <-time.After(wait):
|
||||||
|
if wait == time.Nanosecond*100 {
|
||||||
|
wait = def_wait
|
||||||
|
}
|
||||||
|
out = false
|
||||||
}
|
}
|
||||||
AddLocalTask(handler, int(math.Abs(float64(i))), nil)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if task_to_dispatch == nil {
|
for {
|
||||||
var task TaskT
|
tasks, err := GetDbMultitple[TaskT](db, "tasks as t "+
|
||||||
err := GetDBOnce(db, &task, "tasks as t "+
|
|
||||||
// Get dependencies
|
// Get dependencies
|
||||||
"left join tasks_dependencies as td on t.id=td.main_id "+
|
"left join tasks_dependencies as td on t.id=td.main_id "+
|
||||||
// Get the task that the dependency resolves to
|
// Get the task that the dependency resolves to
|
||||||
"left join tasks as t2 on t2.id=td.dependent_id "+
|
"left join tasks as t2 on t2.id=td.dependent_id "+
|
||||||
"where t.status=1 "+
|
"where t.status=1 "+
|
||||||
"group by t.id having count(td.id) filter (where t2.status in (0,1,2,3)) = 0;")
|
"group by t.id having count(td.id) filter (where t2.status in (0,1,2,3)) = 0 limit 20;")
|
||||||
if err != NotFoundError && err != nil {
|
if err != NotFoundError && err != nil {
|
||||||
log.Error("Failed to get tasks from db", "err", err)
|
log.Error("Failed to get tasks from db", "err", err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if err == NotFoundError {
|
if err == NotFoundError || len(tasks) == 0 {
|
||||||
task_to_dispatch = nil
|
|
||||||
} else {
|
|
||||||
temp := Task(task)
|
|
||||||
task_to_dispatch = &temp
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if task_to_dispatch != nil && task_to_dispatch.TaskType != int(TASK_TYPE_DELETE_USER) {
|
|
||||||
// TODO split tasks into cpu tasks and GPU tasks
|
|
||||||
mutex := handler.DataMap["runners_mutex"].(*sync.Mutex)
|
|
||||||
mutex.Lock()
|
|
||||||
remote_runners := handler.DataMap["runners"].(map[string]interface{})
|
|
||||||
|
|
||||||
for k, v := range remote_runners {
|
|
||||||
runner_data := v.(map[string]interface{})
|
|
||||||
runner_info := runner_data["runner_info"].(*Runner)
|
|
||||||
|
|
||||||
if runner_data["task"] != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
if runner_info.UserId != task_to_dispatch.UserId {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
go handleRemoteTask(handler, base, k, *task_to_dispatch)
|
|
||||||
task_to_dispatch = nil
|
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
mutex.Unlock()
|
for _, task_to_dispatch := range tasks {
|
||||||
}
|
ttd := Task(*task_to_dispatch)
|
||||||
|
if task_to_dispatch != nil && task_to_dispatch.TaskType != int(TASK_TYPE_DELETE_USER) {
|
||||||
|
// TODO split tasks into cpu tasks and GPU tasks
|
||||||
|
mutex := handler.DataMap["runners_mutex"].(*sync.Mutex)
|
||||||
|
mutex.Lock()
|
||||||
|
remote_runners := handler.DataMap["runners"].(map[string]interface{})
|
||||||
|
|
||||||
if task_to_dispatch != nil {
|
for k, v := range remote_runners {
|
||||||
for i := 0; i < len(task_runners_used); i += 1 {
|
runner_data := v.(map[string]interface{})
|
||||||
if !task_runners_used[i] {
|
runner_info := runner_data["runner_info"].(*Runner)
|
||||||
task_runners[i] <- *task_to_dispatch
|
|
||||||
task_runners_used[i] = true
|
if runner_data["task"] != nil {
|
||||||
AddLocalTask(handler, i+1, task_to_dispatch)
|
continue
|
||||||
task_to_dispatch = nil
|
}
|
||||||
|
|
||||||
|
if runner_info.UserId != task_to_dispatch.UserId {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
go handleRemoteTask(handler, base, k, ttd)
|
||||||
|
task_to_dispatch = nil
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
mutex.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
used = 0
|
||||||
|
if task_to_dispatch != nil {
|
||||||
|
for i := 0; i < len(task_runners_used); i += 1 {
|
||||||
|
if task_runners_used[i] <= QUEUE_SIZE {
|
||||||
|
ttd.UpdateStatusLog(base, TASK_QUEUED, "Runner picked up task")
|
||||||
|
task_runners[i] <- ttd
|
||||||
|
task_runners_used[i] += 1
|
||||||
|
AddLocalTask(handler, i+1, &ttd)
|
||||||
|
task_to_dispatch = nil
|
||||||
|
wait = time.Nanosecond * 100
|
||||||
|
break
|
||||||
|
} else {
|
||||||
|
used += 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if used == len(task_runners_used) {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
if used == len(task_runners_used) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -50,6 +50,7 @@ const (
|
|||||||
TASK_PREPARING = 0
|
TASK_PREPARING = 0
|
||||||
TASK_TODO = 1
|
TASK_TODO = 1
|
||||||
TASK_PICKED_UP = 2
|
TASK_PICKED_UP = 2
|
||||||
|
TASK_QUEUED = 5
|
||||||
TASK_RUNNING = 3
|
TASK_RUNNING = 3
|
||||||
TASK_DONE = 4
|
TASK_DONE = 4
|
||||||
)
|
)
|
||||||
|
@ -102,7 +102,7 @@ func (c *Config) Cleanup(db db.Db) {
|
|||||||
failLog(err)
|
failLog(err)
|
||||||
_, err = db.Exec("update models set status=$1 where status=$2", FAILED_PREPARING, PREPARING)
|
_, err = db.Exec("update models set status=$1 where status=$2", FAILED_PREPARING, PREPARING)
|
||||||
failLog(err)
|
failLog(err)
|
||||||
_, err = db.Exec("update tasks set status=$1 where status=$2", TASK_TODO, TASK_PICKED_UP)
|
_, err = db.Exec("update tasks set status=$1 where status=$2 or status=$3", TASK_TODO, TASK_PICKED_UP, TASK_QUEUED)
|
||||||
failLog(err)
|
failLog(err)
|
||||||
|
|
||||||
tasks, err := GetDbMultitple[Task](db, "tasks where status=$1", TASK_RUNNING)
|
tasks, err := GetDbMultitple[Task](db, "tasks where status=$1", TASK_RUNNING)
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
|
|
||||||
events {
|
events {
|
||||||
worker_connections 1024;
|
worker_connections 2024;
|
||||||
}
|
}
|
||||||
|
|
||||||
http {
|
http {
|
||||||
|
2
run.sh
2
run.sh
@ -1,2 +1,2 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
podman run --rm --network host --gpus all --name fyp-server -it -v $(pwd):/app -e "TERM=xterm-256color" fyp-server bash
|
podman run --network host --gpus all --replace --name fyp-server --ulimit=nofile=100000:100000 -it -v $(pwd):/app -e "TERM=xterm-256color" --restart=always andre-fyp-server
|
||||||
|
@ -59,7 +59,6 @@ create table if not exists model_definition (
|
|||||||
accuracy real default 0,
|
accuracy real default 0,
|
||||||
target_accuracy integer not null,
|
target_accuracy integer not null,
|
||||||
epoch integer default 0,
|
epoch integer default 0,
|
||||||
-- TODO add max epoch
|
|
||||||
-- 1: Pre Init
|
-- 1: Pre Init
|
||||||
-- 2: Init
|
-- 2: Init
|
||||||
-- 3: Training
|
-- 3: Training
|
||||||
@ -78,7 +77,7 @@ create table if not exists model_definition_layer (
|
|||||||
-- 1: input
|
-- 1: input
|
||||||
-- 2: dense
|
-- 2: dense
|
||||||
-- 3: flatten
|
-- 3: flatten
|
||||||
-- TODO add conv
|
-- 4: block
|
||||||
layer_type integer not null,
|
layer_type integer not null,
|
||||||
-- e.g. 28,28,1
|
-- e.g. 28,28,1
|
||||||
-- a 28x28 grayscale image
|
-- a 28x28 grayscale image
|
||||||
@ -102,7 +101,6 @@ create table if not exists exp_model_head (
|
|||||||
|
|
||||||
accuracy real default 0,
|
accuracy real default 0,
|
||||||
|
|
||||||
-- TODO add max epoch
|
|
||||||
-- 1: Pre Init
|
-- 1: Pre Init
|
||||||
-- 2: Init
|
-- 2: Init
|
||||||
-- 3: Training
|
-- 3: Training
|
||||||
|
@ -143,6 +143,15 @@ def addBlock(
|
|||||||
model.add(layers.Dropout(0.4))
|
model.add(layers.Dropout(0.4))
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
def resblock(x, kernelsize = 3, filters = 128):  # residual block: two convolutions whose output is added back onto the input x
|
||||||
|
fx = layers.Conv2D(filters, kernelsize, activation='relu', padding='same')(x)  # first convolution, ReLU-activated, 'same' padding
|
||||||
|
fx = layers.BatchNormalization()(fx)
|
||||||
|
fx = layers.Conv2D(filters, kernelsize, padding='same')(fx)  # second convolution, no activation before the merge
|
||||||
|
out = layers.Add()([x,fx])  # skip connection; presumably requires x to already have `filters` channels - TODO confirm
|
||||||
|
out = layers.ReLU()(out)
|
||||||
|
out = layers.BatchNormalization()(out)
|
||||||
|
return out  # the tensor after the final batch normalization
|
||||||
|
|
||||||
|
|
||||||
{{ if .LoadPrev }}
|
{{ if .LoadPrev }}
|
||||||
model = tf.keras.saving.load_model('{{.LastModelRunPath}}')
|
model = tf.keras.saving.load_model('{{.LastModelRunPath}}')
|
||||||
|
@ -42,6 +42,7 @@
|
|||||||
|
|
||||||
import 'src/styles/forms.css';
|
import 'src/styles/forms.css';
|
||||||
import { notificationStore } from 'src/lib/NotificationsStore.svelte';
|
import { notificationStore } from 'src/lib/NotificationsStore.svelte';
|
||||||
|
import Spinner from 'src/lib/Spinner.svelte';
|
||||||
|
|
||||||
let model: Promise<Model> = $state(new Promise(() => {}));
|
let model: Promise<Model> = $state(new Promise(() => {}));
|
||||||
let _model: Model | undefined = $state(undefined);
|
let _model: Model | undefined = $state(undefined);
|
||||||
@ -188,7 +189,6 @@
|
|||||||
<h1 class="text-center">
|
<h1 class="text-center">
|
||||||
{m.name}
|
{m.name}
|
||||||
</h1>
|
</h1>
|
||||||
<!-- TODO improve message -->
|
|
||||||
<h2 class="text-center">Failed to prepare model</h2>
|
<h2 class="text-center">Failed to prepare model</h2>
|
||||||
|
|
||||||
<DeleteModel model={m} />
|
<DeleteModel model={m} />
|
||||||
@ -206,8 +206,7 @@
|
|||||||
{:else if m.status == 3}
|
{:else if m.status == 3}
|
||||||
<BaseModelInfo model={m} />
|
<BaseModelInfo model={m} />
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<!-- TODO improve this -->
|
Processing zip file... <Spinner />
|
||||||
Processing zip file...
|
|
||||||
</div>
|
</div>
|
||||||
{:else if m.status == -3 || m.status == -4}
|
{:else if m.status == -3 || m.status == -4}
|
||||||
<BaseModelInfo model={m} />
|
<BaseModelInfo model={m} />
|
||||||
|
@ -70,16 +70,16 @@
|
|||||||
<button class="tab" class:selected={isActive('upload')} onclick={setActive('upload')}>
|
<button class="tab" class:selected={isActive('upload')} onclick={setActive('upload')}>
|
||||||
Upload
|
Upload
|
||||||
</button>
|
</button>
|
||||||
<button
|
<!--button
|
||||||
class="tab"
|
class="tab"
|
||||||
class:selected={isActive('create-class')}
|
class:selected={isActive('create-class')}
|
||||||
onclick={setActive('create-class')}
|
onclick={setActive('create-class')}
|
||||||
>
|
>
|
||||||
Create Class
|
Create Class
|
||||||
</button>
|
</button-->
|
||||||
<button class="tab" class:selected={isActive('api')} onclick={setActive('api')}>
|
<!--button class="tab" class:selected={isActive('api')} onclick={setActive('api')}>
|
||||||
Api
|
Api
|
||||||
</button>
|
</button-->
|
||||||
</div>
|
</div>
|
||||||
<div class="content" class:selected={isActive('upload')}>
|
<div class="content" class:selected={isActive('upload')}>
|
||||||
<form onsubmit={preventDefault(uploadZip)}>
|
<form onsubmit={preventDefault(uploadZip)}>
|
||||||
@ -111,10 +111,10 @@
|
|||||||
{/if}
|
{/if}
|
||||||
</form>
|
</form>
|
||||||
</div>
|
</div>
|
||||||
<div class="content" class:selected={isActive('create-class')}>
|
<!--div class="content" class:selected={isActive('create-class')}>
|
||||||
<ModelTable {classes} {model} {onreload} />
|
<ModelTable {classes} {model} {onreload} />
|
||||||
</div>
|
</div-->
|
||||||
<div class="content" class:selected={isActive('api')}>TODO</div>
|
<!--div class="content" class:selected={isActive('api')}>TODO</div-->
|
||||||
</Tabs>
|
</Tabs>
|
||||||
<div class="tabs"></div>
|
<div class="tabs"></div>
|
||||||
{:else}
|
{:else}
|
||||||
@ -122,7 +122,7 @@
|
|||||||
{#if numberOfInvalidImages > 0}
|
{#if numberOfInvalidImages > 0}
|
||||||
<p class="danger">
|
<p class="danger">
|
||||||
There are images {numberOfInvalidImages} that were loaded that do not have the correct format.
|
There are images {numberOfInvalidImages} that were loaded that do not have the correct format.
|
||||||
These images will be delete when the model trains.
|
These images will be deleted when the model trains.
|
||||||
</p>
|
</p>
|
||||||
{/if}
|
{/if}
|
||||||
<ModelTable {classes} {model} {onreload} />
|
<ModelTable {classes} {model} {onreload} />
|
||||||
|
@ -54,7 +54,7 @@
|
|||||||
{#if number_of_invalid_images > 0}
|
{#if number_of_invalid_images > 0}
|
||||||
<p class="danger">
|
<p class="danger">
|
||||||
There are images {number_of_invalid_images} that were loaded that do not have the correct format.DeleteZip
|
There are images {number_of_invalid_images} that were loaded that do not have the correct format.DeleteZip
|
||||||
These images will be delete when the model trains.
|
These images will be deleted when the model trains.
|
||||||
</p>
|
</p>
|
||||||
{/if}
|
{/if}
|
||||||
<!-- TODO expanding mode -->
|
<!-- TODO expanding mode -->
|
||||||
@ -101,14 +101,14 @@
|
|||||||
<h2>To train the model please provide data to the model first</h2>
|
<h2>To train the model please provide data to the model first</h2>
|
||||||
{/if}
|
{/if}
|
||||||
</form>
|
</form>
|
||||||
{:else}
|
{:else if ![4, 6, 7].includes(model.status)}
|
||||||
<form class:submitted onsubmit={submitRetrain}>
|
<form class:submitted onsubmit={submitRetrain}>
|
||||||
{#if has_data}
|
{#if has_data}
|
||||||
<h2>This model has new classes and can be expanded</h2>
|
<h2>This model has new classes and can be expanded</h2>
|
||||||
{#if number_of_invalid_images > 0}
|
{#if number_of_invalid_images > 0}
|
||||||
<p class="danger">
|
<p class="danger">
|
||||||
There are images {number_of_invalid_images} that were loaded that do not have the correct format.DeleteZip
|
There are images {number_of_invalid_images} that were loaded that do not have the correct format.DeleteZip
|
||||||
These images will be delete when the model trains.
|
These images will be deleted when the model trains.
|
||||||
</p>
|
</p>
|
||||||
{/if}
|
{/if}
|
||||||
<button> Retrain </button>
|
<button> Retrain </button>
|
||||||
|
Loading…
Reference in New Issue
Block a user