Select ranges with max simultaneous processes (SQLite)

how can i select ranges with max simultaneous processes so first should be {start: "2021-01-01 03:00:00", end: "2021-01-01 06:00:00", count: 3}
BEGIN TRANSACTION;
CREATE TABLE IF NOT EXISTS "events" (
"id" INTEGER,
"cid" INTEGER NOT NULL,
"type" TEXT NOT NULL,
"date" DATETIME NOT NULL,
PRIMARY KEY("id")
);
INSERT INTO "events" ("id","cid","type","date")
VALUES
(1,0,'start','2021-01-01 00:00:00'),
(2,2,'start','2021-01-01 02:00:00'),
(3,1,'start','2021-01-01 03:00:00'),
(4,0,'end','2021-01-01 06:00:00'),
(5,1,'end','2021-01-01 07:00:00'),
(6,3,'start','2021-01-01 08:00:00'),
(7,3,'end','2021-01-01 08:30:00'),
(8,4,'start','2021-01-01 08:45:00'),
(9,2,'end','2021-01-01 09:00:00'),
(10,5,'start','2021-01-01 10:00:00'),
(11,6,'start','2021-01-01 11:00:00'),
(12,4,'end','2021-01-01 12:00:00'),
(13,5,'end','2021-01-01 13:00:00'),
(14,7,'start','2021-01-01 14:00:00'),
(15,7,'end','2021-01-01 15:00:00'),
(16,6,'end','2021-01-01 16:00:00'),
(17,8,'start','2021-01-01 15:30:00'),
(18,9,'start','2021-01-01 16:30:00'),
(19,10,'start','2021-01-01 17:00:00'),
(20,8,'end','2021-01-01 18:00:00'),
(21,10,'end','2021-01-01 19:00:00'),
(22,11,'start','2021-01-01 19:30:00'),
(23,11,'end','2021-01-01 19:45:00'),
(24,9,'end','2021-01-01 19:59:00');
COMMIT;
BEGIN TRANSACTION;
CREATE TABLE IF NOT EXISTS "events" (
"id" INTEGER,
"cid" INTEGER NOT NULL,
"type" TEXT NOT NULL,
"date" DATETIME NOT NULL,
PRIMARY KEY("id")
);
INSERT INTO "events" ("id","cid","type","date")
VALUES
(1,0,'start','2021-01-01 00:00:00'),
(2,2,'start','2021-01-01 02:00:00'),
(3,1,'start','2021-01-01 03:00:00'),
(4,0,'end','2021-01-01 06:00:00'),
(5,1,'end','2021-01-01 07:00:00'),
(6,3,'start','2021-01-01 08:00:00'),
(7,3,'end','2021-01-01 08:30:00'),
(8,4,'start','2021-01-01 08:45:00'),
(9,2,'end','2021-01-01 09:00:00'),
(10,5,'start','2021-01-01 10:00:00'),
(11,6,'start','2021-01-01 11:00:00'),
(12,4,'end','2021-01-01 12:00:00'),
(13,5,'end','2021-01-01 13:00:00'),
(14,7,'start','2021-01-01 14:00:00'),
(15,7,'end','2021-01-01 15:00:00'),
(16,6,'end','2021-01-01 16:00:00'),
(17,8,'start','2021-01-01 15:30:00'),
(18,9,'start','2021-01-01 16:30:00'),
(19,10,'start','2021-01-01 17:00:00'),
(20,8,'end','2021-01-01 18:00:00'),
(21,10,'end','2021-01-01 19:00:00'),
(22,11,'start','2021-01-01 19:30:00'),
(23,11,'end','2021-01-01 19:45:00'),
(24,9,'end','2021-01-01 19:59:00');
COMMIT;
I managed to get first result, though I'm pretty sure it's accidental
SELECT e1.type, e1.date, e2.type, e2.date, count(e1.date < e2.date) as count FROM events AS e1
join events as e2
where e1.type = 'start' and e2.type = 'end' and e1.date < e2.date
group by e2.date
limit 1;
SELECT e1.type, e1.date, e2.type, e2.date, count(e1.date < e2.date) as count FROM events AS e1
join events as e2
where e1.type = 'start' and e2.type = 'end' and e1.date < e2.date
group by e2.date
limit 1;
unfortunately I have close to 0 experience with sql http://sqlfiddle.com/
28 Replies
13eck
13eck•11mo ago
You're trying to get
{
start: "2021-01-01 03:00:00",
end: "2021-01-01 06:00:00",
count: 3
}
{
start: "2021-01-01 03:00:00",
end: "2021-01-01 06:00:00",
count: 3
}
Start/end is from rows 1 and 4: the start and end event for the cid (what is cid?). Where is the count coming from? The number of start events from start to end inclusive? Also, if you change your table definition to "id" INTEGER PRIMARY KEY the database will assign an auto-incrementing ID number for you, no need to include it in your INSERT statement.
__vasilich__
__vasilich__•11mo ago
cid - process id, I think it's not needed, count should be calculated - the count of processes running in the range. so on 1 row we have 1 process running, on 2 - 2, on 3 - 3, on 4 - 2 because one is ended, so on 3 row was the start of the range and on 4 row is the end, and count is 3. that's how we get
{
start: "2021-01-01 03:00:00",
end: "2021-01-01 06:00:00",
count: 3
}
{
start: "2021-01-01 03:00:00",
end: "2021-01-01 06:00:00",
count: 3
}
Jochem
Jochem•11mo ago
So if I'm understanding you correctly, you want the periods, not the events? So SQLITE should come up with the time during which the most events are running simultaneously, not the event which has the most overlap? I managed to get the latter working
WITH cte AS (SELECT e1.cid, e1.date as start_date, e2.date as end_date
FROM events e1
LEFT JOIN events e2 ON e1.cid = e2.cid AND e2.type = "end"
WHERE e1.type = "start" order by e1.cid
)

SELECT *, (
SELECT COUNT(*) FROM cte cte2
WHERE cte1.cid <> cte2.cid
AND (
cte2.start_date BETWEEN cte1.start_date AND cte1.end_date
OR cte2.end_date BETWEEN cte1.start_date AND cte1.end_date
OR cte2.start_date = cte1.start_date
OR cte2.end_date = cte1.end_date
)
) from cte cte1;
WITH cte AS (SELECT e1.cid, e1.date as start_date, e2.date as end_date
FROM events e1
LEFT JOIN events e2 ON e1.cid = e2.cid AND e2.type = "end"
WHERE e1.type = "start" order by e1.cid
)

SELECT *, (
SELECT COUNT(*) FROM cte cte2
WHERE cte1.cid <> cte2.cid
AND (
cte2.start_date BETWEEN cte1.start_date AND cte1.end_date
OR cte2.end_date BETWEEN cte1.start_date AND cte1.end_date
OR cte2.start_date = cte1.start_date
OR cte2.end_date = cte1.end_date
)
) from cte cte1;
SQLITE coming up with its own fresh interval where the overlap is biggest is going to be a lot more complicated though, I think
Jochem
Jochem•11mo ago
I put the events in a google calendar to help visualize, and you're right that the max is 3. It occurs three times though:
No description
Jochem
Jochem•11mo ago
I think you'd have to find a way to create a temporary table where each event is split into <precision> intervals, probably 1 minute each, and then do an aggregate on the new temporary table I'm honestly not sure if this is something that's best done in query, instead of in a more regular programming environment I've gotten it to create a temporary table minutes_cte where each minute in the given intervals is present as a row, then gotten it to calculate the number of concurrent events for each row:
WITH cte AS (SELECT e1.cid, e1.date as start_date, e2.date as end_date
FROM events e1
LEFT JOIN events e2 ON e1.cid = e2.cid AND e2.type = "end"
WHERE e1.type = "start" order by e1.cid
),
minutes_cte(cid,minutes,end_date) AS (
SELECT
cid,
DATETIME(cte.start_date) AS minutes,
end_date
FROM cte
UNION ALL
SELECT
cid,
DATETIME(minutes, '+1 minute'),
end_date
FROM minutes_cte
WHERE DATETIME(minutes, '+1 minute') <= DATETIME(end_date)
)

SELECT
minutes, COUNT(*) AS concurrent_events
FROM minutes_cte GROUP BY minutes ORDER BY minutes ASC, concurrent_events DESC;
WITH cte AS (SELECT e1.cid, e1.date as start_date, e2.date as end_date
FROM events e1
LEFT JOIN events e2 ON e1.cid = e2.cid AND e2.type = "end"
WHERE e1.type = "start" order by e1.cid
),
minutes_cte(cid,minutes,end_date) AS (
SELECT
cid,
DATETIME(cte.start_date) AS minutes,
end_date
FROM cte
UNION ALL
SELECT
cid,
DATETIME(minutes, '+1 minute'),
end_date
FROM minutes_cte
WHERE DATETIME(minutes, '+1 minute') <= DATETIME(end_date)
)

SELECT
minutes, COUNT(*) AS concurrent_events
FROM minutes_cte GROUP BY minutes ORDER BY minutes ASC, concurrent_events DESC;
the issue I'm running into now, is that I could get the start of the first period where there are three concurrent events, and the end of the last period where there are three concurrent events, but the end of the first, any in between, and the start of the last are a lot harder to get So don't ask me how, but
WITH RECURSIVE cte AS (
SELECT
e1.cid,
e1.date AS start_date,
e2.date AS end_date
FROM events e1
LEFT JOIN events e2 ON e1.cid = e2.cid AND e2.type = "end"
WHERE e1.type = "start"
ORDER BY e1.cid
),
minutes_cte AS (
SELECT
cid,
DATETIME(cte.start_date) AS minutes,
end_date
FROM cte

UNION ALL

SELECT
cid,
DATETIME(minutes, '+1 minute'),
end_date
FROM minutes_cte
WHERE DATETIME(minutes, '+1 minute') <= DATETIME(end_date)
),
conc AS (
SELECT
COUNT(*) OVER (ORDER BY minutes) AS id,
minutes,
COUNT(*) AS concurrent_events
FROM minutes_cte
GROUP BY minutes
ORDER BY minutes ASC, concurrent_events DESC
),
MaxCev AS (
SELECT MAX(concurrent_events) AS max_cev
FROM conc
)

SELECT
(
SELECT c.minutes AS period_start
FROM conc c
LEFT JOIN conc p ON p.id = c.id - 1
JOIN MaxCev m ON c.concurrent_events = m.max_cev
WHERE c.concurrent_events > p.concurrent_events
) AS period_start,
DATETIME(
(
SELECT c.minutes AS period_end
FROM conc c
LEFT JOIN conc p ON p.id = c.id - 1
JOIN MaxCev m ON p.concurrent_events = m.max_cev
WHERE c.concurrent_events < p.concurrent_events
),
'-1 minute'
) AS period_end,
(SELECT max_cev from MaxCev) AS event_count
WITH RECURSIVE cte AS (
SELECT
e1.cid,
e1.date AS start_date,
e2.date AS end_date
FROM events e1
LEFT JOIN events e2 ON e1.cid = e2.cid AND e2.type = "end"
WHERE e1.type = "start"
ORDER BY e1.cid
),
minutes_cte AS (
SELECT
cid,
DATETIME(cte.start_date) AS minutes,
end_date
FROM cte

UNION ALL

SELECT
cid,
DATETIME(minutes, '+1 minute'),
end_date
FROM minutes_cte
WHERE DATETIME(minutes, '+1 minute') <= DATETIME(end_date)
),
conc AS (
SELECT
COUNT(*) OVER (ORDER BY minutes) AS id,
minutes,
COUNT(*) AS concurrent_events
FROM minutes_cte
GROUP BY minutes
ORDER BY minutes ASC, concurrent_events DESC
),
MaxCev AS (
SELECT MAX(concurrent_events) AS max_cev
FROM conc
)

SELECT
(
SELECT c.minutes AS period_start
FROM conc c
LEFT JOIN conc p ON p.id = c.id - 1
JOIN MaxCev m ON c.concurrent_events = m.max_cev
WHERE c.concurrent_events > p.concurrent_events
) AS period_start,
DATETIME(
(
SELECT c.minutes AS period_end
FROM conc c
LEFT JOIN conc p ON p.id = c.id - 1
JOIN MaxCev m ON p.concurrent_events = m.max_cev
WHERE c.concurrent_events < p.concurrent_events
),
'-1 minute'
) AS period_end,
(SELECT max_cev from MaxCev) AS event_count
will give you the first occurrence of any period in the events table with the highest number of concurrent events it uses common table expressions (WITH) to create a series of temporary tables. The first (cte) takes the events entries and condenses them down to a single row per event with a start and end time. The second (minutes_cte) takes those events and extrapolates them to one row per minute for each event, maintaining the cid The third (conc) then takes that list, adds a unique id we'll need later, and adds the number of concurrent events. minutes_cte and conc could theoretically be combined The final one (MaxCev, or max concurrent_events) is a little helper because SQLITE doesn't like doing WHERE cev = (SELECT max(cev) FROM conc), it's just there to give access to the maximum number of concurrent events The select then does two subqueries. One to fetch the start date (which is ordered by minutes (which should technically probably be minute, but that's a reserved word in SQL) in conc) of the first event The second one fetches the first occurrence where concurrent_events drops again after having been maxed out. This would be the next minute in the next event, so it subtracts a minute from the result there is one edge case though. If the maximum number of events period ends at the same time as all the events in that period (so you have three events that all end at 4pm), it'll break because it'll find the next minute in the minutes_cte table that has fewer than the max number of events, which might not be the very next minute after the end of the events Which you can fix by changing the final SELECT to
SELECT
(
SELECT c.minutes AS period_start
FROM conc c
LEFT JOIN conc p ON p.id = c.id - 1
JOIN MaxCev m ON c.concurrent_events = m.max_cev
WHERE c.concurrent_events > p.concurrent_events
) AS period_start,
(
SELECT minutes
FROM conc
WHERE id =
(
SELECT p.id AS period_end
FROM conc c
LEFT JOIN conc p ON c.id = p.id + 1
JOIN MaxCev m ON p.concurrent_events = m.max_cev
WHERE c.concurrent_events < p.concurrent_events
)
) AS period_end,
(
SELECT max_cev FROM MaxCev
) AS event_count
SELECT
(
SELECT c.minutes AS period_start
FROM conc c
LEFT JOIN conc p ON p.id = c.id - 1
JOIN MaxCev m ON c.concurrent_events = m.max_cev
WHERE c.concurrent_events > p.concurrent_events
) AS period_start,
(
SELECT minutes
FROM conc
WHERE id =
(
SELECT p.id AS period_end
FROM conc c
LEFT JOIN conc p ON c.id = p.id + 1
JOIN MaxCev m ON p.concurrent_events = m.max_cev
WHERE c.concurrent_events < p.concurrent_events
)
) AS period_end,
(
SELECT max_cev FROM MaxCev
) AS event_count
The only bug that is left, is that it won't work if the first occurrence of the highest concurrence also ends all at the same time, and is also the final event(s) in the table I think it should return nothing as the end date, but still find the start date and max concurrency without trouble @vasilich sorry to ping, just really want to make sure you see this 😅
__vasilich__
__vasilich__•11mo ago
WOW... I'll check it all later, bit busy at the moment, but on the first glance it's way beyond my knowledge)
Jochem
Jochem•11mo ago
I got massively nerd-sniped and learned a ton about these kinds of queries, so mostly mine too ;D
ErickO
ErickO•11mo ago
what in the world I can only trust that Jochem's solution works I'm just not sure if it...should this just seems like the wrong approach altogether
Jochem
Jochem•11mo ago
I don't disagree with that tbh it's much better to do this in memory in code rather than mangled temporary tables
__vasilich__
__vasilich__•11mo ago
@Jochem ok, I finally got time to check you code(sorry about that), it does work, thanks!
Jochem
Jochem•11mo ago
glad to help!
__vasilich__
__vasilich__•11mo ago
I suspected it would require to do subqueries, but I hoped it would be simpler)
Jochem
Jochem•11mo ago
it might be possible to simplify it, but the separate start and end records make that a bit complicated I also assumed that these events are one-and-done, so they each have exactly one start and one end, so if they're recurring and dont' get new cids that might be an issue
__vasilich__
__vasilich__•11mo ago
yeah, they're unique at the moment I'm trying to figure out how to get last period, and both(first and last)
Jochem
Jochem•11mo ago
SELECT
period_start, period_end, (SELECT max_cev FROM MaxCev) AS event_count
FROM (
SELECT COUNT(*) over (ORDER BY c.minutes) AS id, c.minutes AS period_start
FROM conc c
LEFT JOIN conc p ON p.id = c.id - 1
JOIN MaxCev m ON c.concurrent_events = m.max_cev
WHERE c.concurrent_events > p.concurrent_events
) starts
LEFT JOIN
(
SELECT COUNT(*) OVER (ORDER BY minutes) AS id, minutes AS period_end
FROM conc
WHERE id IN
(
SELECT p.id
FROM conc c
LEFT JOIN conc p ON c.id = p.id + 1
JOIN MaxCev m ON p.concurrent_events = m.max_cev
WHERE c.concurrent_events < p.concurrent_events
)
) ends USING(id)
SELECT
period_start, period_end, (SELECT max_cev FROM MaxCev) AS event_count
FROM (
SELECT COUNT(*) over (ORDER BY c.minutes) AS id, c.minutes AS period_start
FROM conc c
LEFT JOIN conc p ON p.id = c.id - 1
JOIN MaxCev m ON c.concurrent_events = m.max_cev
WHERE c.concurrent_events > p.concurrent_events
) starts
LEFT JOIN
(
SELECT COUNT(*) OVER (ORDER BY minutes) AS id, minutes AS period_end
FROM conc
WHERE id IN
(
SELECT p.id
FROM conc c
LEFT JOIN conc p ON c.id = p.id + 1
JOIN MaxCev m ON p.concurrent_events = m.max_cev
WHERE c.concurrent_events < p.concurrent_events
)
) ends USING(id)
the main difference is the IN in the second query, it gives you the full list of end-times instead of just the first. The COUNT(*) OVER (ORDER BY minutes) is generating an ID like the other query, and then that is used to join the queries. This works, but it feels very loose. The only thing that's keeping the periods aligned, is the sort order of minutes_cte, which I wouldn't want to rely on for anything critical.
__vasilich__
__vasilich__•11mo ago
THANKS!!! as far as I know sql was made for regular people to understand, but in my mind it hasn't clicked yet. Basic stuff are understandable though so to get first row I can use LIMIT 1, and for the last one ORDER BY period_start DESC LIMIT 1, is there a simple way to get both?
Jochem
Jochem•11mo ago
a union of the LIMIT 1 and ORDER BY ... LIMIT 1 would work but we're well past what you should be doing in SQL... if I were the developer on this thing, I'd refactor the events table to have a single row per event with a start_time and end_time column, then use a for loop in whatever language I was using to find the maximum concurrent events rather than use SQL for that all
__vasilich__
__vasilich__•11mo ago
I agree
Jochem
Jochem•11mo ago
and even if I had, I'd just get the full result set (cause it's not going to be thousands of rows, right?) and then just pop and unshift the first and last element off into a new array to have the first and last entries
__vasilich__
__vasilich__•11mo ago
yeah, you right
Jochem
Jochem•11mo ago
like, it was fun figuring this stuff out, but if I saw this in a code review I'd put a big red NOPE stamp on it out of general principle
__vasilich__
__vasilich__•11mo ago
KISS not even close)
Jochem
Jochem•11mo ago
yup 😄
__vasilich__
__vasilich__•11mo ago
WITH RECURSIVE cte AS (
SELECT
e1.cid,
e1.date AS start_date,
e2.date AS end_date
FROM events e1
LEFT JOIN events e2 ON e1.cid = e2.cid AND e2.type = "end"
WHERE e1.type = "start"
ORDER BY e1.cid
),
minutes_cte AS (
SELECT
cid,
DATETIME(cte.start_date) AS minutes,
end_date
FROM cte

UNION ALL

SELECT
cid,
DATETIME(minutes, '+1 minute'),
end_date
FROM minutes_cte
WHERE DATETIME(minutes, '+1 minute') <= DATETIME(end_date)
),
conc AS (
SELECT
COUNT(*) OVER (ORDER BY minutes) AS id,
minutes,
COUNT(*) AS concurrent_events
FROM minutes_cte
GROUP BY minutes
ORDER BY minutes ASC, concurrent_events DESC
),
MaxCev AS (
SELECT MAX(concurrent_events) AS max_cev
FROM conc
),
ResultTable as (SELECT
period_start, period_end, (SELECT max_cev FROM MaxCev) AS event_count
FROM (
SELECT COUNT(*) over (ORDER BY c.minutes) AS id, c.minutes AS period_start
FROM conc c
LEFT JOIN conc p ON p.id = c.id - 1
JOIN MaxCev m ON c.concurrent_events = m.max_cev
WHERE c.concurrent_events > p.concurrent_events
) starts
LEFT JOIN
(
SELECT COUNT(*) OVER (ORDER BY minutes) AS id, minutes AS period_end
FROM conc
WHERE id IN
(
SELECT p.id
FROM conc c
LEFT JOIN conc p ON c.id = p.id + 1
JOIN MaxCev m ON p.concurrent_events = m.max_cev
WHERE c.concurrent_events < p.concurrent_events
)
) ends USING(id))
SELECT * FROM (SELECT * FROM ResultTable LIMIT 1)
UNION ALL
SELECT * FROM (SELECT * FROM ResultTable
ORDER BY period_start DESC LIMIT 1)
WITH RECURSIVE cte AS (
SELECT
e1.cid,
e1.date AS start_date,
e2.date AS end_date
FROM events e1
LEFT JOIN events e2 ON e1.cid = e2.cid AND e2.type = "end"
WHERE e1.type = "start"
ORDER BY e1.cid
),
minutes_cte AS (
SELECT
cid,
DATETIME(cte.start_date) AS minutes,
end_date
FROM cte

UNION ALL

SELECT
cid,
DATETIME(minutes, '+1 minute'),
end_date
FROM minutes_cte
WHERE DATETIME(minutes, '+1 minute') <= DATETIME(end_date)
),
conc AS (
SELECT
COUNT(*) OVER (ORDER BY minutes) AS id,
minutes,
COUNT(*) AS concurrent_events
FROM minutes_cte
GROUP BY minutes
ORDER BY minutes ASC, concurrent_events DESC
),
MaxCev AS (
SELECT MAX(concurrent_events) AS max_cev
FROM conc
),
ResultTable as (SELECT
period_start, period_end, (SELECT max_cev FROM MaxCev) AS event_count
FROM (
SELECT COUNT(*) over (ORDER BY c.minutes) AS id, c.minutes AS period_start
FROM conc c
LEFT JOIN conc p ON p.id = c.id - 1
JOIN MaxCev m ON c.concurrent_events = m.max_cev
WHERE c.concurrent_events > p.concurrent_events
) starts
LEFT JOIN
(
SELECT COUNT(*) OVER (ORDER BY minutes) AS id, minutes AS period_end
FROM conc
WHERE id IN
(
SELECT p.id
FROM conc c
LEFT JOIN conc p ON c.id = p.id + 1
JOIN MaxCev m ON p.concurrent_events = m.max_cev
WHERE c.concurrent_events < p.concurrent_events
)
) ends USING(id))
SELECT * FROM (SELECT * FROM ResultTable LIMIT 1)
UNION ALL
SELECT * FROM (SELECT * FROM ResultTable
ORDER BY period_start DESC LIMIT 1)
Jochem
Jochem•11mo ago
clever!
ErickO
ErickO•11mo ago
I gotta say, if you're able to control what the db looks like I'd probably change it a bit but if this isn't your project and you're stuck with that structure then carry on
__vasilich__
__vasilich__•11mo ago
second one
ErickO
ErickO•11mo ago
sad, this is why you shouldn't skip the db design class PepeLaugh might've been true in 1998 but a lot has changed since then there's a reason it's a whole separate job these days