/*****************************************************************************

	unsort - reorder files semi-randomly
	Copyright (C) 2008  Wessel Dankers <wsl@fruit.je>

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program.  If not, see <http://www.gnu.org/licenses/>.

*****************************************************************************/

#include <stdbool.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

#include "error.h"
#include "lsort.h"
#include "unfind.h"
#include "iovec.h"
#include "shuffle.h"

/* Byte that separates path components; unfind() searches for it with memchr(). */
uint8_t unfind_sep = '/';
/*
** Weighting strategy string read by unfind(). One character is consulted
** per iteration of unfind()'s main loop; once the string is exhausted its
** last character is reused. 'n' sets a node's merge ratio to the number
** of entries it contains, any other character sets the ratio to 1.
*/
const char *unfind_strategy = "n";

/*
** Return the index of the first entry in [u, end) that, looking past its
** first `off` bytes, does not begin with the `prefix_len` bytes at `prefix`.
** Returns `end` when every entry in the range matches.
*/
static uint32_t unfind_prefix_end(const struct iovec *iov, uint32_t u, uint32_t end, size_t off, const uint8_t *prefix, size_t prefix_len) {
	while(u < end) {
		const uint8_t *buf = iov[u].iov_base;
		size_t len = iov[u].iov_len;
		buf += off;
		len -= off;
		if(len < prefix_len)
			break;
		if(memcmp(prefix, buf, prefix_len))
			break;
		u++;
	}
	return u;
}

/*
** Reorder the index table `tlb` so that entries living in different
** directories are interleaved, while leaves inside one directory are
** shuffled with `shuffle_algo`.
**
** iov          - `count` path entries; must be sorted leaves-first (an
**                unsorted input triggers an internal error exit).
** count        - number of entries in iov, tlb and bounce.
** tlb          - index table that is rewritten in place.
** bounce       - scratch table used as the destination of merge() and
**                shuffle_algo before results are copied back into tlb.
** shuffle_algo - called as shuffle_algo(src, dst, n); passing shuffle_none
**                disables randomisation.
**
** The globals unfind_sep and unfind_strategy select the separator byte
** and the per-node merge ratio respectively.
*/
void unfind(struct iovec *iov, uint32_t count, uint32_t *tlb, uint32_t *bounce, shuffle_algo_t shuffle_algo) {
	size_t off = 0;
	uint32_t start = 0, end = count, u;
	uint32_t chunk_start, chunk_count;
	uint8_t *buf0, *buf;
	uint8_t *lim0;
	size_t len0, len;
	uint32_t level;
	uint32_t firstdir, curdir = 0, numdir = 0, maxdir;
	bool hasleaves;
	merge_t *dirs;
	size_t *offs;
	size_t strategy_len;
	bool proportional;

	/* Nothing to reorder; this also avoids reading iov[0] of an empty array. */
	if(!count)
		return;

	strategy_len = strlen(unfind_strategy);

	maxdir = count * 2 + 2;

	dirs = xalloc(maxdir * sizeof *dirs);
	offs = xalloc(maxdir * sizeof *offs);

	/*
	** NOTE(review): `level` advances on every pass through this loop,
	** including collapses and moves to a sibling node, so it is an
	** iteration counter rather than a tree depth. Confirm that this is
	** the intended index into unfind_strategy.
	*/
	for(level = 0;; level++) {
		/*
		** In each iteration, we scan and find as many
		** entries with a common prefix as we can.
		** Our initial domain is the entire input.
		** We remember an offset for each range we find.
		** The input is sorted leaves-first.
		** So if the first entry is not a leaf, this directory
		** does not contain leaves at all.
		*/
		buf0 = iov[start].iov_base;
		len0 = iov[start].iov_len;
		buf0 += off;
		len0 -= off;
		lim0 = memchr(buf0, unfind_sep, len0);
		u = start + 1;
		firstdir = numdir;

		/*
		** Pick the strategy character for this iteration, reusing the
		** last one once the string runs out. An empty string has no
		** last character (strategy_len - 1 would wrap around), so it
		** is treated like the default strategy 'n'.
		*/
		if(strategy_len)
			proportional = unfind_strategy[level < strategy_len ? level : strategy_len - 1] == 'n';
		else
			proportional = true;

		if(lim0) {
			/* first entry is not a leaf */
			hasleaves = false;
			lim0++;
			len0 = (size_t)(lim0 - buf0);
			u = unfind_prefix_end(iov, u, end, off, buf0, len0);
			/*
			** If this directory contains everything to the end
			** we might as well ignore this entire level.
			*/
			if(u == end) {
				/* collapse */
				off += len0;
				continue;
			}
			if(numdir >= maxdir)
				exit_error(ERROR_INTERNAL, "%s:%d: Internal error: numdir >= maxdir", __FILE__, __LINE__);
			dirs[numdir].start = start;
			dirs[numdir].count = u - start;
			dirs[numdir].ratio = proportional ? u - start : 1;
			dirs[numdir].cursor = 0;
			offs[numdir] = off + len0;
			numdir++;
		} else {
			/* first entry is a leaf, grab everything up to the first non-leaf */
			hasleaves = true;
			while(u < end) {
				buf = iov[u].iov_base;
				len = iov[u].iov_len;
				buf += off;
				len -= off;
				if(memchr(buf, unfind_sep, len))
					break;
				u++;
			}
			if(numdir >= maxdir)
				exit_error(ERROR_INTERNAL, "%s:%d: Internal error: numdir >= maxdir", __FILE__, __LINE__);
			dirs[numdir].start = start;
			dirs[numdir].count = u - start;
			dirs[numdir].ratio = proportional ? u - start : 1;
			dirs[numdir].cursor = 0;
			/*
			** The leaves node is never descended into, but its offset
			** slot is copied when the node is removed below, so give
			** it a defined value.
			*/
			offs[numdir] = off;
			numdir++;
		}
		while(u < end) {
			/*
			** Grab all remaining dirs. Pretty much the same as
			** the non-leaf case above, except we don't need to
			** worry about collapsing stuff anymore.
			*/
			chunk_start = u;
			buf0 = iov[chunk_start].iov_base;
			len0 = iov[chunk_start].iov_len;
			buf0 += off;
			len0 -= off;
			lim0 = memchr(buf0, unfind_sep, len0);
			/* A leaf after a directory means the input was not sorted leaves-first. */
			if(!lim0)
				exit_error(ERROR_INTERNAL, "Internal error u:%"PRIu32" off:%"PRIu64, u, (uint64_t)off);
			lim0++;
			len0 = (size_t)(lim0 - buf0);
			u = unfind_prefix_end(iov, chunk_start + 1, end, off, buf0, len0);
			if(numdir >= maxdir)
				exit_error(ERROR_INTERNAL, "%s:%d: Internal error: numdir >= maxdir", __FILE__, __LINE__);
			dirs[numdir].start = chunk_start;
			dirs[numdir].count = u - chunk_start;
			dirs[numdir].ratio = proportional ? u - chunk_start : 1;
			dirs[numdir].cursor = 0;
			offs[numdir] = off + len0;
			numdir++;
		}

		/*
		** If we managed to grab more than one section, we can now
		** merge them by interleaving.
		*/
		if(numdir - firstdir > 1) {
			/*
			** Merge the entries from these dirs.
			** NOTE(review): the destination passed here is `bounce`,
			** while the copy below reads from `bounce + start`.
			** Confirm against merge() that it writes its output at
			** the nodes' absolute start positions.
			*/
			merge(dirs + firstdir, numdir - firstdir, tlb + start, bounce, shuffle_algo != shuffle_none);
			/* Copy back from the destination buffer. */
			memcpy(tlb + start, bounce + start, count * sizeof *tlb);
			/*
			** The merge() call shuffles our directory nodes, so sort 'm back into
			** the original order.
			** Possible optimization: just find the leaves node (if any) and put it
			** back into the first spot.
			*/
			merge_sort(dirs + firstdir, numdir - firstdir);
		}

		/*
		** Leaves do not have an internal structure, so just apply
		** normal randomization to them.
		*/
		if(hasleaves) {
			chunk_count = dirs[firstdir].count;
			if(shuffle_algo != shuffle_none) {
				shuffle_algo(tlb + start, bounce, chunk_count);
				memcpy(tlb + start, bounce, chunk_count * sizeof *tlb);
			}

			/*
			** Remove the leaves node so we don't recurse on it:
			** the last node of this batch takes over its slot.
			*/
			numdir--;
			dirs[firstdir] = dirs[numdir];
			offs[firstdir] = offs[numdir];
		}

		/*
		** Set our coordinates to the next node in the list
		** to prepare for the next iteration.
		*/
		if(curdir < numdir) {
			start = dirs[curdir].start;
			count = dirs[curdir].count;
			off = offs[curdir];
			end = start + count;
			curdir++;
		} else {
			break;
		}
	}

	free(offs);
	free(dirs);
}

#ifdef TEST_UNFIND

/*
** Test driver (built only with TEST_UNFIND): sorts a small fixed list of
** paths with lsort(), prints it, runs unfind() over an identity index
** table, applies the resulting table with iovec_shuffle() and prints the
** list again.
*/
int main(void) {
	uint32_t u, count;
	struct iovec *vecjes;
	uint32_t *tlb, *bounce;
	const char *str[] = {
		"a/a/a",
		"a/b/a",
		"b/a/a"
/*
		"a",
		"a",
		"a/b/1",
		"a/b/2",
		"x",
		"x",
		"x",
		"q",
		"q",
		"q",
		"y",
		"y",
		"a/b/3",
		"a/b/4",
		"a/b/5",
		"a/c/8",
		"a/c/9",
		"a/c/0",
		"b/c"
*/
	};
	count = sizeof str / sizeof *str;
	vecjes = xalloc(count * sizeof *vecjes);
	tlb = xalloc(count * sizeof *tlb);
	bounce = xalloc(count * sizeof *bounce);
	for(u = 0; u < count; u++) {
		vecjes[u].iov_base = strdup(str[u]);
		vecjes[u].iov_len = strlen(str[u]);
		/* Start from the identity mapping. */
		tlb[u] = u;
	}
	lsort(vecjes, count);
	/* iov_base is a void pointer; %s needs a char pointer. */
	for(u = 0; u < count; u++)
		warn_error("| %s", (const char *)vecjes[u].iov_base);
	unfind(vecjes, count, tlb, bounce, shuffle_random);
	iovec_shuffle(vecjes, tlb, count);
	warn_error("");
	for(u = 0; u < count; u++)
		warn_error("| %s", (const char *)vecjes[u].iov_base);
	return 0;
}

#endif
