#include "twn_draw_c.h"
#include "twn_draw.h"
#include "twn_engine_context_c.h"
#include "twn_util.h"
#include "twn_workers_c.h"
#include "twn_textures_c.h"

#define FAST_OBJ_IMPLEMENTATION
#define FAST_OBJ_REALLOC SDL_realloc
#define FAST_OBJ_FREE SDL_free
#include <fast_obj.h>
#include <stb_ds.h>
#include <physfs.h>
#include <physfsrwops.h>
#include <SDL2/SDL.h>

/* TODO: it might make sense to have a separate path for really small models, collecting them together */

static struct ModelCacheItem {
    char *key;
    struct ModelCacheItemValue {
        /* UncoloredSpaceTriangle to use indices against */
        VertexBuffer vertices;

        /* array or uint16_t or uint32_t, depending on length */
        /* populated in such way that shared textures are combined into continuous range */
        VertexBuffer *indices;

        // /* note: this whole scheme only works without taking normals into account, but it's quite fast */
        // struct ModelCacheIndexRange {
        //     Rect srcrect;
        //     size_t offset;
        //     size_t length;
        //     TextureKey texture;
        // } *ranges;

        /* cached base command, modified for ranges */
        DeferredCommand *commands;
    } value;
} *model_cache;

/* TODO: store index to model cache instead */
static struct ModelDrawCommand {
    char *model;
    Vec3 position;
    Vec3 rotation;
    Vec3 scale;
} *model_draw_commands;

/* deferred queue of model files to load from worker threads */
static SDL_mutex *model_load_mutex;
static struct ModelLoadRequest {
    char const *path;
    fastObjMesh *mesh;
    enum {
        /* not yet started, only path is available */
        MODEL_LOAD_REQUEST_WAITING,
        /* initial load of data, unrelated to graphics state and thus applicable to running in worker threads */
        MODEL_LOAD_REQUEST_LOADING,
        /* mesh is loaded and awaits to be prepared and loaded onto gpu */
        MODEL_LOAD_REQUEST_LOADED,
    } stage;
} *model_load_queue;
static bool model_load_initialized;


/* use streaming via callbacks to reduce memory congestion */
static void model_load_callback_close(void *handle, void *udata) {
    (void)udata;
    ((SDL_RWops *)handle)->close(handle);
}

static void *model_load_callback_open(const char *path, void *udata) {
    (void)udata;
    return PHYSFSRWOPS_openRead(path);
}

static size_t model_load_callback_read(void *handle, void *dst, size_t bytes, void *udata) {
    (void)udata;
    return ((SDL_RWops *)handle)->read(handle, dst, 1, bytes);
}

static unsigned long model_load_callback_size(void *handle, void *udata) {
    (void)udata;
    return ((SDL_RWops *)handle)->size(handle);
}


/* it's safe to access everything without lock after this returns true and no public api is possible to call */
bool models_load_workers_finished(void) {
    bool result = true;
    SDL_LockMutex(model_load_mutex);
    for (size_t i = 0; i < arrlenu(model_load_queue); ++i) {
        if (model_load_queue[i].stage != MODEL_LOAD_REQUEST_LOADED) {
            result = false;
            break;
        }
    }
    SDL_UnlockMutex(model_load_mutex);
    return result;
}


/* entry point for workers, polled every time a job semaphore is posted */
/* returns false if there was nothing to do */
bool models_load_workers_thread(void) {
    /* attempt to grab something to work on */
    char const *request_path = NULL;
    ssize_t queue_index = -1;
    SDL_LockMutex(model_load_mutex);
    for (size_t i = 0; i < arrlenu(model_load_queue); ++i) {
        if (model_load_queue[i].stage == MODEL_LOAD_REQUEST_WAITING) {
            request_path = model_load_queue[i].path;
            queue_index = i;
            model_load_queue[i].stage = MODEL_LOAD_REQUEST_LOADING;
            break;
        }
    }
    SDL_UnlockMutex(model_load_mutex);
    /* nothing to do, bail */
    if (queue_index == -1)
        return false;

    fastObjCallbacks const callbacks = {
        .file_close = model_load_callback_close,
        .file_open = model_load_callback_open,
        .file_read = model_load_callback_read,
        .file_size = model_load_callback_size
    };

    /* TODO: would be nice if we could start dependency texture load immediately */
    fastObjMesh *const mesh = fast_obj_read_with_callbacks(request_path, &callbacks, NULL);

    SDL_LockMutex(model_load_mutex);
    model_load_queue[queue_index].mesh = mesh;
    model_load_queue[queue_index].stage = MODEL_LOAD_REQUEST_LOADED;
    SDL_UnlockMutex(model_load_mutex);

    return true;
}


void draw_model(const char *model,
                Vec3 position,
                Vec3 rotation,
                Vec3 scale)
{
    /* TODO: make this all work. */
    SDL_assert_always(false);

    /* if model is missing, queue it up for loading */
    struct ModelCacheItem const *item;
    /* reuse the key from model_cache */
    char *modelcopy;
    if (!(item = shgetp_null(model_cache, model))) {
        modelcopy = SDL_strdup(model);
        shput(model_cache, modelcopy, (struct ModelCacheItemValue) {0});
        SDL_LockMutex(model_load_mutex);
        struct ModelLoadRequest const request = {
            .stage = MODEL_LOAD_REQUEST_WAITING,
            .path = modelcopy,
        };
        arrpush(model_load_queue, request);
        SDL_UnlockMutex(model_load_mutex);
        workers_add_job();
    } else
        modelcopy = item->key;

    struct ModelDrawCommand const command = {
        .model = modelcopy,
        .position = position,
        .rotation = rotation,
        .scale = scale
    };

    arrpush(model_draw_commands, command);
}


/* prepare vertex buffers before textures are ready */
void models_update_pre_textures(void) {
    /* TODO: instead of waiting for all we could start uploading when there's something to upload */
    /*       it's unlikely to happen, but could improve the worst cases */
    while (!models_load_workers_finished())
        SDL_Delay(1);

    /* TODO: it might be better to parallelize this part by sending buffer mappings to workers */
    for (size_t i = 0; i < arrlenu(model_load_queue); ++i) {
        fastObjMesh *const mesh = model_load_queue[i].mesh;
        SDL_assert(mesh && model_load_queue[i].stage == MODEL_LOAD_REQUEST_LOADED);
        struct ModelCacheItem *const item = shgetp(model_cache, model_load_queue[i].path);

        /* calculate required vertex and index buffers */
        /* texture specific index buffers, to later me merged */
        uint32_t **indices = NULL;
        arrsetlen(indices, mesh->texture_count);
        SDL_memset(indices, 0, mesh->texture_count * sizeof (uint32_t *));

        /* vertices are shared for all subcommands */
        struct ModelVertex {
            Vec3 position;
            Vec2 uv;
        } *vertices = NULL;

        for (unsigned int o = 0; o < mesh->object_count; ++o) {
            /* we assume that vertices are only shared within the same object, */
            /* which allows us to keep hash table small in most cases */
            /* it should work great for quake style brush based models */
            struct ModelVertexIndexItem {
                struct ModelVertexIndexItemKey {
                    uint32_t vertex_index;
                    uint32_t uv_index;
                } key;
                uint32_t value; /* index to vertex */
            } *merge_hash = NULL;

            fastObjGroup const *const object = &mesh->objects[o];
            size_t idx = 0;

            for (unsigned int f = 0; f < object->face_count; ++f) {
                unsigned int const fv = mesh->face_vertices[object->face_offset + f];
                unsigned int const mi = mesh->face_materials[object->face_offset + f];
                /* TODO: handle missing */
                fastObjMaterial const *const m = mesh->materials ? &mesh->materials[mi] : NULL;

                /* unwrap polygon fans into triangles, first point is reused for all following */
                fastObjIndex const i0 = mesh->indices[object->index_offset + idx];
                ptrdiff_t i0_hash = hmgeti(merge_hash, ((struct ModelVertexIndexItemKey) { i0.p, i0.t }));
                if (i0_hash == -1) {
                    hmput(merge_hash, ((struct ModelVertexIndexItemKey) { i0.p, i0.t }), arrlenu(vertices));
                    arrpush(vertices, ((struct ModelVertex) {
                        (Vec3) { mesh->positions[3 * i0.p + 0] / 64, mesh->positions[3 * i0.p + 1] / 64, mesh->positions[3 * i0.p + 2] / 64 },
                        (Vec2) { mesh->texcoords[2 * i0.t + 0], mesh->texcoords[2 * i0.t + 1] }
                    }));
                    i0_hash = hmlen(merge_hash) - 1;
                    // i0_hash = arrlenu(vertices) - 1;
                }

                /* other fan points over shifting by 1 window */
                for (unsigned int t = 0; t < fv - 2; ++t) {
                    fastObjIndex const i1 = mesh->indices[object->index_offset + idx + 1 + t];
                    ptrdiff_t i1_hash = hmgeti(merge_hash, ((struct ModelVertexIndexItemKey) { i1.p, i1.t }));
                    if (i1_hash == -1) {
                        hmput(merge_hash, ((struct ModelVertexIndexItemKey) { i1.p, i1.t }), arrlenu(vertices));
                        arrpush(vertices, ((struct ModelVertex) {
                            (Vec3) { mesh->positions[3 * i1.p + 0] / 64, mesh->positions[3 * i1.p + 1] / 64, mesh->positions[3 * i1.p + 2] / 64 },
                            (Vec2) { mesh->texcoords[2 * i1.t + 0], mesh->texcoords[2 * i1.t + 1] }
                        }));
                        i1_hash = hmlen(merge_hash) - 1;
                        // i1_hash = arrlenu(vertices) - 1;
                    }

                    fastObjIndex const i2 = mesh->indices[object->index_offset + idx + 2 + t];
                    ptrdiff_t i2_hash = hmgeti(merge_hash, ((struct ModelVertexIndexItemKey) { i2.p, i2.t }));
                    if (i2_hash == -1) {
                        hmput(merge_hash, ((struct ModelVertexIndexItemKey) { i2.p, i2.t }), arrlenu(vertices));
                        arrpush(vertices, ((struct ModelVertex) {
                            (Vec3) { mesh->positions[3 * i2.p + 0] / 64, mesh->positions[3 * i2.p + 1] / 64, mesh->positions[3 * i2.p + 2] / 64 },
                            (Vec2) { mesh->texcoords[2 * i2.t + 0], mesh->texcoords[2 * i2.t + 1] }
                        }));
                        i2_hash = hmlen(merge_hash) - 1;
                        // i2_hash = arrlenu(vertices) - 1;
                    }

                    arrpush(indices[m->map_Kd], (uint32_t)i0_hash);
                    arrpush(indices[m->map_Kd], (uint32_t)i1_hash);
                    arrpush(indices[m->map_Kd], (uint32_t)i2_hash);
                }

                idx += fv;
            }

            hmfree(merge_hash);
        }

        if (mesh->color_count != 0)
            log_warn("TODO: color in models isn't yet supported");

        /* upload vertices */
        VertexBuffer vertex_buffer = create_vertex_buffer();
        specify_vertex_buffer(vertex_buffer, vertices, arrlenu(vertices) * sizeof (struct ModelVertex));
        item->value.vertices = vertex_buffer;

        /* collect texture usages into index ranges */
        /* TODO: force repeating texture upload before its used in drawing */
        for (size_t t = 0; t < arrlenu(indices); ++t) {
            VertexBuffer index_buffer = create_vertex_buffer();
            specify_vertex_buffer(index_buffer, indices[t], arrlenu(indices[i]) * sizeof (uint32_t));
            arrpush(item->value.indices, index_buffer);

            /* build command */
            DeferredCommandDraw command = {0};

            command.vertices = (AttributeArrayPointer) {
                .arity = 3,
                .type = TWN_FLOAT,
                .stride = sizeof   (struct ModelVertex),
                .offset = offsetof (struct ModelVertex, position),
                .buffer = vertex_buffer
            };

            command.texcoords = (AttributeArrayPointer) {
                .arity = 2,
                .type = TWN_FLOAT,
                .stride = sizeof   (struct ModelVertex),
                .offset = offsetof (struct ModelVertex, uv),
                .buffer = vertex_buffer
            };

            TextureKey const texture_key = textures_get_key(&ctx.texture_cache, mesh->textures[t].name);

            command.textured = true;
            command.texture_key = texture_key;
            command.texture_repeat = true;

            command.element_buffer = index_buffer;
            command.element_count = (uint32_t)(arrlenu(indices[t]));
            command.range_end = (uint32_t)(arrlenu(indices[t]));

            /* TODO: support alpha blended case? */
            TextureMode mode = textures_get_mode(&ctx.texture_cache, texture_key);
            if (mode == TEXTURE_MODE_GHOSTLY)
                mode = TEXTURE_MODE_SEETHROUGH;

            command.texture_mode = mode;
            command.pipeline = PIPELINE_SPACE;

            command.depth_range_high = depth_range_high;
            command.depth_range_low = depth_range_low;

            DeferredCommand final_command = {
                .type = DEFERRED_COMMAND_TYPE_DRAW,
                .draw = command
            };

            arrpush(item->value.commands, final_command);

            arrfree(indices[i]);
        }

        arrfree(vertices);
        arrfree(indices);

        /* TODO: sort ranges based on length in assumption that bigger mesh parts will occlude more */
    }
}


/* adjust uvs into atlases when needed */
void models_update_post_textures(void) {
    SDL_assert(!ctx.texture_cache.is_dirty);

    arrsetlen(model_load_queue, 0);
}


void finally_draw_models(void) {
    for (int i = 0; i < arrlen(model_draw_commands); ++i) {
        struct ModelDrawCommand *const command = &model_draw_commands[i];
        struct ModelCacheItem *const cache = shgetp(model_cache, command->model);
        for (int c = 0; c < arrlen(cache->value.commands); ++c) {
            arrpush(deferred_commands, cache->value.commands[c]);
        }
    }

    arrsetlen(model_draw_commands, 0);
}


/* drop model caches */
void free_model_cache(void) {
    for (size_t i = 0; i < shlenu(model_cache); ++i) {
        // fast_obj_destroy(model_cache[i].value.mesh);
        SDL_free(model_cache[i].key);
    }

    shfree(model_cache);
}


void models_state_init(void) {
    if (model_load_initialized)
        return;
    model_load_mutex = SDL_CreateMutex();
    model_load_initialized = true;
}


void models_state_deinit(void) {
    if (!model_load_initialized)
        return;
    free_model_cache();
    arrfree(model_load_queue);
    SDL_DestroyMutex(model_load_mutex);
    model_load_initialized = false;
}