Improved classification performance

This commit is contained in:
Andre Henriques 2024-05-15 05:32:49 +01:00
parent 516d1d7634
commit 652542d261
18 changed files with 211 additions and 98 deletions

View File

@ -31,7 +31,10 @@ ADD go.mod .
ADD go.sum .
ADD main.go .
ADD logic logic
ADD entrypoint.sh .
RUN go install || true
CMD ["go", "run", "."]
RUN go build .
CMD ["./entrypoint.sh"]

View File

@ -12,7 +12,7 @@ USER = "service"
[Worker]
PULLING_TIME = "500ms"
NUMBER_OF_WORKERS = 1
NUMBER_OF_WORKERS = 16
[DB]
MAX_CONNECTIONS = 600

View File

@ -23,6 +23,13 @@ services:
- db
volumes:
- "./config.toml:/app/config.toml"
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
proxy-server:
image: andre-fyp-proxy
networks:

4
entrypoint.sh Executable file
View File

@ -0,0 +1,4 @@
#!/bin/bash
# Container entrypoint: keep the server alive by relaunching the compiled
# `fyp` binary every time it exits (crash or normal termination).
while true; do
    ./fyp
done

View File

@ -16,7 +16,6 @@ import (
)
func loadBaseImage(c *Context, id string) {
// TODO handle more types than png
infile, err := os.Open(path.Join("savedData", id, "baseimage.png"))
if err != nil {
c.Logger.Errorf("Failed to read image for model with id %s\n", id)

View File

@ -4,6 +4,7 @@ import (
"errors"
"os"
"path"
"runtime/debug"
. "git.andr3h3nriqu3s.com/andr3/fyp/logic/db_types"
. "git.andr3h3nriqu3s.com/andr3/fyp/logic/tasks/utils"
@ -37,11 +38,19 @@ func ReadJPG(scope *op.Scope, imagePath string, channels int64) *image.Image {
return image.Scale(0, 255)
}
func runModelNormal(model *BaseModel, def_id string, inputImage *tf.Tensor) (order int, confidence float32, err error) {
func runModelNormal(model *BaseModel, def_id string, inputImage *tf.Tensor, data *RunnerModelData) (order int, confidence float32, err error) {
order = 0
err = nil
tf_model := tg.LoadModel(path.Join("savedData", model.Id, "defs", def_id, "model"), []string{"serve"}, nil)
var tf_model *tg.Model = nil
if data.Id != nil && *data.Id == def_id {
tf_model = data.Model
} else {
tf_model = tg.LoadModel(path.Join("savedData", model.Id, "defs", def_id, "model"), []string{"serve"}, nil)
data.Model = tf_model
data.Id = &def_id
}
results := tf_model.Exec([]tf.Output{
tf_model.Op("StatefulPartitionedCall", 0),
@ -125,10 +134,15 @@ func runModelExp(base BasePack, model *BaseModel, def_id string, inputImage *tf.
return
}
func ClassifyTask(base BasePack, task Task) (err error) {
// RunnerModelData holds the model a runner most recently loaded so that
// consecutive classification tasks for the same definition can reuse it
// instead of loading it from disk again. Callers pass it by pointer and
// runModelNormal updates it in place.
type RunnerModelData struct {
// Id points to the definition id that Model was loaded for; nil means
// no model is currently cached.
Id *string
// Model is the loaded model matching Id; nil when nothing is cached.
Model *tg.Model
}
func ClassifyTask(base BasePack, task Task, data *RunnerModelData) (err error) {
defer func() {
if r := recover(); r != nil {
base.GetLogger().Error("Task failed due to", "error", r)
base.GetLogger().Error("Task failed due to", "error", r, "stack", string(debug.Stack()))
task.UpdateStatusLog(base, TASK_FAILED_RUNNING, "Task failed running")
}
}()
@ -186,6 +200,8 @@ func ClassifyTask(base BasePack, task Task) (err error) {
if model.ModelType == 2 {
base.GetLogger().Info("Running model normal", "model", model.Id, "def", def_id)
data.Model = nil
data.Id = nil
vi, confidence, err = runModelExp(base, model, def_id, inputImage)
if err != nil {
task.UpdateStatusLog(base, TASK_FAILED_RUNNING, "Failed to run model")
@ -193,7 +209,7 @@ func ClassifyTask(base BasePack, task Task) (err error) {
}
} else {
base.GetLogger().Info("Running model normal", "model", model.Id, "def", def_id)
vi, confidence, err = runModelNormal(model, def_id, inputImage)
vi, confidence, err = runModelNormal(model, def_id, inputImage, data)
if err != nil {
task.UpdateStatusLog(base, TASK_FAILED_RUNNING, "Failed to run model")
return

View File

@ -1191,7 +1191,7 @@ func generateDefinition(c BasePack, model *BaseModel, target_accuracy int, numbe
}
order++
loop := max(1, int((math.Log(float64(model.Width)) / math.Log(float64(10)))))
loop := max(1, int(math.Ceil((math.Log(float64(model.Width))/math.Log(float64(10)))))+1)
for i := 0; i < loop; i++ {
_, err = def.MakeLayer(db, order, LAYER_SIMPLE_BLOCK, "")
order++
@ -1299,7 +1299,7 @@ func generateExpandableDefinition(c BasePack, model *BaseModel, target_accuracy
order++
// Create the blocks
loop := int((math.Log(float64(model.Width)) / math.Log(float64(10))))
loop := int(math.Ceil((math.Log(float64(model.Width)) / math.Log(float64(10))))) + 1
/*if model.Width < 50 && model.Height < 50 {
loop = 0

View File

@ -68,7 +68,7 @@ func handleTasksStats(handle *Handle) {
} else if task.Status < 2 {
total.Classfication_pre += 1
hours[hour].Classfication_pre += 1
} else if task.Status < 4 {
} else if task.Status < 4 || task.Status == 5 {
total.Classfication_running += 1
hours[hour].Classfication_running += 1
}

View File

@ -19,6 +19,8 @@ import (
. "git.andr3h3nriqu3s.com/andr3/fyp/logic/utils"
)
// QUEUE_SIZE is the buffer capacity of each local runner's task channel and
// the threshold the runners use to decide when to report back to the
// orchestrator.
var QUEUE_SIZE = 10
/**
* Actually runs the code
*/
@ -47,17 +49,28 @@ func runner(config Config, db db.Db, task_channel chan Task, index int, back_cha
Host: config.Hostname,
}
loaded_model := RunnerModelData{
Id: nil,
Model: nil,
}
count := 0
for task := range task_channel {
logger.Info("Got task", "task", task)
task.UpdateStatusLog(base, TASK_PICKED_UP, "Runner picked up task")
if task.TaskType == int(TASK_TYPE_CLASSIFICATION) {
logger.Info("Classification Task")
if err = ClassifyTask(base, task); err != nil {
if err = ClassifyTask(base, task, &loaded_model); err != nil {
logger.Error("Classification task failed", "error", err)
}
back_channel <- index
if count == QUEUE_SIZE {
back_channel <- index
count = 0
} else {
count += 1
}
continue
} else if task.TaskType == int(TASK_TYPE_TRAINING) {
logger.Info("Training Task")
@ -65,7 +78,12 @@ func runner(config Config, db db.Db, task_channel chan Task, index int, back_cha
logger.Error("Failed to tain the model", "error", err)
}
back_channel <- index
if count == QUEUE_SIZE {
back_channel <- index
count = 0
} else {
count += 1
}
continue
} else if task.TaskType == int(TASK_TYPE_RETRAINING) {
logger.Info("Retraining Task")
@ -73,7 +91,12 @@ func runner(config Config, db db.Db, task_channel chan Task, index int, back_cha
logger.Error("Failed to tain the model", "error", err)
}
back_channel <- index
if count == QUEUE_SIZE {
back_channel <- index
count = 0
} else {
count += 1
}
continue
} else if task.TaskType == int(TASK_TYPE_DELETE_USER) {
logger.Warn("User deleting Task")
@ -81,13 +104,23 @@ func runner(config Config, db db.Db, task_channel chan Task, index int, back_cha
logger.Error("Failed to tain the model", "error", err)
}
back_channel <- index
if count == QUEUE_SIZE {
back_channel <- index
count = 0
} else {
count += 1
}
continue
}
logger.Error("Do not know how to route task", "task", task)
task.UpdateStatusLog(base, TASK_FAILED_RUNNING, "Do not know how to route task")
back_channel <- index
if count == QUEUE_SIZE {
back_channel <- index
count = 0
} else {
count += 1
}
}
}
@ -145,7 +178,7 @@ func handleRemoteTask(handler *Handle, base BasePack, runner_id string, task Tas
/**
* Tells the orchestrator to look at the task list from time to time
*/
func attentionSeeker(config Config, back_channel chan int) {
func attentionSeeker(config Config, db db.Db, back_channel chan int) {
logger := log.NewWithOptions(os.Stdout, log.Options{
ReportCaller: true,
ReportTimestamp: true,
@ -170,6 +203,20 @@ func attentionSeeker(config Config, back_channel chan int) {
for true {
back_channel <- 0
for {
var s struct {
Count int `json:"count(*)"`
}
err := GetDBOnce(db, &s, "tasks where stauts = 5 or status = 3")
if err != nil {
break
}
if s.Count == 0 {
break
}
time.Sleep(t)
}
time.Sleep(t)
}
}
@ -194,11 +241,16 @@ func RunnerOrchestrator(db db.Db, config Config, handler *Handle) {
}
gpu_workers := config.GpuWorker.NumberOfWorkers
def_wait, err := time.ParseDuration(config.GpuWorker.Pulling)
if err != nil {
logger.Error("Failed to load", "error", err)
return
}
logger.Info("Starting runners")
task_runners := make([]chan Task, gpu_workers)
task_runners_used := make([]bool, gpu_workers)
task_runners_used := make([]int, gpu_workers)
// One more to accommodate the Attention Seeker channel
back_channel := make(chan int, gpu_workers+1)
@ -213,12 +265,12 @@ func RunnerOrchestrator(db db.Db, config Config, handler *Handle) {
}
}()
go attentionSeeker(config, back_channel)
// go attentionSeeker(config, db, back_channel)
// Start the runners
for i := 0; i < gpu_workers; i++ {
task_runners[i] = make(chan Task, 10)
task_runners_used[i] = false
task_runners[i] = make(chan Task, QUEUE_SIZE)
task_runners_used[i] = 0
AddLocalRunner(handler, LocalRunner{
RunnerNum: i + 1,
Task: nil,
@ -226,82 +278,107 @@ func RunnerOrchestrator(db db.Db, config Config, handler *Handle) {
go runner(config, db, task_runners[i], i+1, back_channel)
}
var task_to_dispatch *Task = nil
for i := range back_channel {
if i != 0 {
if i > 0 {
logger.Info("Runner freed", "runner", i)
task_runners_used[i-1] = false
} else if i < 0 {
logger.Error("Runner died! Restarting!", "runner", i)
i = int(math.Abs(float64(i)) - 1)
task_runners_used[i] = false
go runner(config, db, task_runners[i], i+1, back_channel)
used := 0
wait := time.Nanosecond * 100
for {
out := true
for out {
select {
case i := <-back_channel:
if i != 0 {
if i > 0 {
logger.Info("Runner freed", "runner", i)
task_runners_used[i-1] = 0
used = 0
} else if i < 0 {
logger.Error("Runner died! Restarting!", "runner", i)
i = int(math.Abs(float64(i)) - 1)
task_runners_used[i] = 0
used = 0
go runner(config, db, task_runners[i], i+1, back_channel)
}
AddLocalTask(handler, int(math.Abs(float64(i))), nil)
} else if used == len(task_runners_used) {
continue
}
case <-time.After(wait):
if wait == time.Nanosecond*100 {
wait = def_wait
}
out = false
}
AddLocalTask(handler, int(math.Abs(float64(i))), nil)
}
if task_to_dispatch == nil {
var task TaskT
err := GetDBOnce(db, &task, "tasks as t "+
for {
tasks, err := GetDbMultitple[TaskT](db, "tasks as t "+
// Get dependencies
"left join tasks_dependencies as td on t.id=td.main_id "+
// Get the task that the dependency resolves to
"left join tasks as t2 on t2.id=td.dependent_id "+
"where t.status=1 "+
"group by t.id having count(td.id) filter (where t2.status in (0,1,2,3)) = 0;")
"group by t.id having count(td.id) filter (where t2.status in (0,1,2,3)) = 0 limit 20;")
if err != NotFoundError && err != nil {
log.Error("Failed to get tasks from db", "err", err)
continue
}
if err == NotFoundError {
task_to_dispatch = nil
} else {
temp := Task(task)
task_to_dispatch = &temp
}
}
if task_to_dispatch != nil && task_to_dispatch.TaskType != int(TASK_TYPE_DELETE_USER) {
// TODO split tasks into cpu tasks and GPU tasks
mutex := handler.DataMap["runners_mutex"].(*sync.Mutex)
mutex.Lock()
remote_runners := handler.DataMap["runners"].(map[string]interface{})
for k, v := range remote_runners {
runner_data := v.(map[string]interface{})
runner_info := runner_data["runner_info"].(*Runner)
if runner_data["task"] != nil {
continue
}
if runner_info.UserId != task_to_dispatch.UserId {
continue
}
go handleRemoteTask(handler, base, k, *task_to_dispatch)
task_to_dispatch = nil
if err == NotFoundError || len(tasks) == 0 {
break
}
mutex.Unlock()
}
for _, task_to_dispatch := range tasks {
ttd := Task(*task_to_dispatch)
if task_to_dispatch != nil && task_to_dispatch.TaskType != int(TASK_TYPE_DELETE_USER) {
// TODO split tasks into cpu tasks and GPU tasks
mutex := handler.DataMap["runners_mutex"].(*sync.Mutex)
mutex.Lock()
remote_runners := handler.DataMap["runners"].(map[string]interface{})
if task_to_dispatch != nil {
for i := 0; i < len(task_runners_used); i += 1 {
if !task_runners_used[i] {
task_runners[i] <- *task_to_dispatch
task_runners_used[i] = true
AddLocalTask(handler, i+1, task_to_dispatch)
task_to_dispatch = nil
for k, v := range remote_runners {
runner_data := v.(map[string]interface{})
runner_info := runner_data["runner_info"].(*Runner)
if runner_data["task"] != nil {
continue
}
if runner_info.UserId != task_to_dispatch.UserId {
continue
}
go handleRemoteTask(handler, base, k, ttd)
task_to_dispatch = nil
break
}
mutex.Unlock()
}
used = 0
if task_to_dispatch != nil {
for i := 0; i < len(task_runners_used); i += 1 {
if task_runners_used[i] <= QUEUE_SIZE {
ttd.UpdateStatusLog(base, TASK_QUEUED, "Runner picked up task")
task_runners[i] <- ttd
task_runners_used[i] += 1
AddLocalTask(handler, i+1, &ttd)
task_to_dispatch = nil
wait = time.Nanosecond * 100
break
} else {
used += 1
}
}
}
if used == len(task_runners_used) {
break
}
}
}
if used == len(task_runners_used) {
break
}
}
}
}

View File

@ -50,6 +50,7 @@ const (
TASK_PREPARING = 0
TASK_TODO = 1
TASK_PICKED_UP = 2
TASK_QUEUED = 5
TASK_RUNNING = 3
TASK_DONE = 4
)

View File

@ -102,7 +102,7 @@ func (c *Config) Cleanup(db db.Db) {
failLog(err)
_, err = db.Exec("update models set status=$1 where status=$2", FAILED_PREPARING, PREPARING)
failLog(err)
_, err = db.Exec("update tasks set status=$1 where status=$2", TASK_TODO, TASK_PICKED_UP)
_, err = db.Exec("update tasks set status=$1 where status=$2 or status=$3", TASK_TODO, TASK_PICKED_UP, TASK_QUEUED)
failLog(err)
tasks, err := GetDbMultitple[Task](db, "tasks where status=$1", TASK_RUNNING)

View File

@ -1,6 +1,6 @@
events {
worker_connections 1024;
worker_connections 2024;
}
http {

2
run.sh
View File

@ -1,2 +1,2 @@
#!/bin/bash
podman run --rm --network host --gpus all --name fyp-server -it -v $(pwd):/app -e "TERM=xterm-256color" fyp-server bash
podman run --network host --gpus all --replace --name fyp-server --ulimit=nofile=100000:100000 -it -v $(pwd):/app -e "TERM=xterm-256color" --restart=always andre-fyp-server

View File

@ -59,7 +59,6 @@ create table if not exists model_definition (
accuracy real default 0,
target_accuracy integer not null,
epoch integer default 0,
-- TODO add max epoch
-- 1: Pre Init
-- 2: Init
-- 3: Training
@ -78,7 +77,7 @@ create table if not exists model_definition_layer (
-- 1: input
-- 2: dense
-- 3: flatten
-- TODO add conv
-- 4: block
layer_type integer not null,
-- ei 28,28,1
-- a 28x28 grayscale image
@ -102,7 +101,6 @@ create table if not exists exp_model_head (
accuracy real default 0,
-- TODO add max epoch
-- 1: Pre Init
-- 2: Init
-- 3: Training

View File

@ -143,6 +143,15 @@ def addBlock(
model.add(layers.Dropout(0.4))
return model
def resblock(x, kernelsize = 3, filters = 128):
    """Residual block: two same-padded convolutions added back onto the input.

    Args:
        x: input tensor of a Keras functional model (channels-last).
        kernelsize: kernel size used by both convolutions.
        filters: number of output channels of both convolutions.

    Returns:
        The tensor obtained after the skip connection, ReLU and batch
        normalization.
    """
    shortcut = x
    # Add() requires both branches to have the same shape. When the input
    # does not already have `filters` channels, project it with a 1x1
    # convolution so the skip connection can still be summed.
    if x.shape[-1] != filters:
        shortcut = layers.Conv2D(filters, 1, padding='same')(x)

    fx = layers.Conv2D(filters, kernelsize, activation='relu', padding='same')(x)
    fx = layers.BatchNormalization()(fx)
    fx = layers.Conv2D(filters, kernelsize, padding='same')(fx)

    out = layers.Add()([shortcut, fx])
    out = layers.ReLU()(out)
    out = layers.BatchNormalization()(out)
    return out
{{ if .LoadPrev }}
model = tf.keras.saving.load_model('{{.LastModelRunPath}}')

View File

@ -42,6 +42,7 @@
import 'src/styles/forms.css';
import { notificationStore } from 'src/lib/NotificationsStore.svelte';
import Spinner from 'src/lib/Spinner.svelte';
let model: Promise<Model> = $state(new Promise(() => {}));
let _model: Model | undefined = $state(undefined);
@ -188,7 +189,6 @@
<h1 class="text-center">
{m.name}
</h1>
<!-- TODO improve message -->
<h2 class="text-center">Failed to prepare model</h2>
<DeleteModel model={m} />
@ -206,8 +206,7 @@
{:else if m.status == 3}
<BaseModelInfo model={m} />
<div class="card">
<!-- TODO improve this -->
Processing zip file...
Processing zip file... <Spinner />
</div>
{:else if m.status == -3 || m.status == -4}
<BaseModelInfo model={m} />

View File

@ -70,16 +70,16 @@
<button class="tab" class:selected={isActive('upload')} onclick={setActive('upload')}>
Upload
</button>
<button
<!--button
class="tab"
class:selected={isActive('create-class')}
onclick={setActive('create-class')}
>
Create Class
</button>
<button class="tab" class:selected={isActive('api')} onclick={setActive('api')}>
</button-->
<!--button class="tab" class:selected={isActive('api')} onclick={setActive('api')}>
Api
</button>
</button-->
</div>
<div class="content" class:selected={isActive('upload')}>
<form onsubmit={preventDefault(uploadZip)}>
@ -111,10 +111,10 @@
{/if}
</form>
</div>
<div class="content" class:selected={isActive('create-class')}>
<!--div class="content" class:selected={isActive('create-class')}>
<ModelTable {classes} {model} {onreload} />
</div>
<div class="content" class:selected={isActive('api')}>TODO</div>
</div-->
<!--div class="content" class:selected={isActive('api')}>TODO</div-->
</Tabs>
<div class="tabs"></div>
{:else}
@ -122,7 +122,7 @@
{#if numberOfInvalidImages > 0}
<p class="danger">
There are images {numberOfInvalidImages} that were loaded that do not have the correct format.
These images will be delete when the model trains.
These images will be deleted when the model trains.
</p>
{/if}
<ModelTable {classes} {model} {onreload} />

View File

@ -54,7 +54,7 @@
{#if number_of_invalid_images > 0}
<p class="danger">
There are images {number_of_invalid_images} that were loaded that do not have the correct format.DeleteZip
These images will be delete when the model trains.
These images will be deleted when the model trains.
</p>
{/if}
<!-- TODO expanding mode -->
@ -101,14 +101,14 @@
<h2>To train the model please provide data to the model first</h2>
{/if}
</form>
{:else}
{:else if ![4, 6, 7].includes(model.status)}
<form class:submitted onsubmit={submitRetrain}>
{#if has_data}
<h2>This model has new classes and can be expanded</h2>
{#if number_of_invalid_images > 0}
<p class="danger">
There are images {number_of_invalid_images} that were loaded that do not have the correct format.DeleteZip
These images will be delete when the model trains.
These images will be deleted when the model trains.
</p>
{/if}
<button> Retrain </button>