Getting histograms of data

When coming into a new environment and trying to learn a bit about the data my customer has; one of the tools I use is to grab histograms of key data. Now in sqlcl and sql developer you have the info+ command that will display histograms; however, if the optimizer did not generate histograms, the information provided will be of limited usefulness (in the sense of histograms)

A histogram will give you information on how the data is distributed, with text data, this is fairly easy using select column;, count(*) from t1 group by column1;

Dates and numbers, well that’s another story.

First I need to load up a table with some test data. updt_stat simply takes the order date and returns a status. This is to simulate and order entry system.

create sequence t1_seq;
create table t1 (
id number     primary key,
order_date    date            not null,
col1          number          not null,
status        varchar2(25)    not null);

alter table t1 modify id default t1_seq.nextval;
create or replace function updt_stat (pdate in date) return varchar2 as
value varchar2(10);
tmp   number;
begin
  tmp := dbms_random.value(0,10);
  case
    when pdate > sysdate - 50 and tmp > 9
        then value := 'Canceled';
    when pdate <= sysdate - 50
        then value := 'Shipped';
    when pdate >= sysdate - 50 and pdate < sysdate - 1
        then value := 'Picked';
    when pdate >= sysdate
        then value := 'Ordered';
    else
        value := 'Back Order';
  end case;
  return value;
end;
/

Now populate the table with some test data.

declare
--create or replace procedure simulate_order_entry as

  i     number;     -- an index variable
  sdate date;       -- the date we are going to insert.

  cursor c1 is
  select dbms_random.value(0,10000) col1
  from dual
  connect by level <=36500;
begin

  -- one year of data
  select sysdate-365 into sdate from dual;
  i := 1;

  -- start the loop, 36,500 times
  -- one year of data

  for rec in c1
  loop
    if ( i / 100 = 1 ) then
      i := 1;
      sdate := sdate + 1;
    end if;

    insert into t1 (order_date, col1, status)
    values
    (sdate, rec.col1, updt_stat(sdate));

    i := i+1;       -- i++
  end loop;
  commit;
end;
/
SQL> select status, count(*)
  2  from t1
  3* group by status;

STATUS           COUNT(*)
_____________ ___________
Shipped             31284
Picked               4334
Canceled              553
Back Order             92
Ordered               237
SQL> select min(order_date),
  2          max(order_date),
  3          width_bucket(order_date, sysdate-365, sysdate+20, 12) bucket,
  4          count(*)
  5   from t1
  6   group by width_bucket(order_date, sysdate-365, sysdate+20, 12)
  7*  order by 1;

MIN(ORDER_DATE)    MAX(ORDER_DATE)       BUCKET    COUNT(*)
__________________ __________________ _________ ___________
16-MAY-22          16-MAY-22                  0          99
17-MAY-22          17-JUN-22                  1        3168
18-JUN-22          19-JUL-22                  2        3168
20-JUL-22          20-AUG-22                  3        3168
21-AUG-22          21-SEP-22                  4        3168
22-SEP-22          23-OCT-22                  5        3168
24-OCT-22          24-NOV-22                  6        3168
25-NOV-22          26-DEC-22                  7        3168
27-DEC-22          27-JAN-23                  8        3168
28-JAN-23          28-FEB-23                  9        3168
01-MAR-23          02-APR-23                 10        3267
03-APR-23          04-MAY-23                 11        3168
05-MAY-23          19-MAY-23                 12        1454
SQL> select trunc(min(col1),4),
  2          trunc(max(col1),4),
  3          width_bucket(col1, 0, 10000, 10) bucket,
  4          count(*)
  5  from t1
  6  group by width_bucket(col1, 0, 10000, 10)
  7* order by 1;

   TRUNC(MIN(COL1),4)    TRUNC(MAX(COL1),4)    BUCKET    COUNT(*)
_____________________ _____________________ _________ ___________
               0.1578              999.9308         1        3710
            1000.0824             1999.9868         2        3543
            2001.0208             2999.6238         3        3640
            3000.4055             3999.8725         4        3692
            4000.2374              4999.823         5        3751
            5000.1659             5999.6815         6        3767
            6000.3364             6999.9993         7        3660
            7000.1511             7999.4147         8        3471
             8000.284              8999.706         9        3651
            9000.1652             9999.5417        10        3615
This entry was posted in infosec. Bookmark the permalink.