systemml/scripts/datagen/genRandData4DescriptiveStats.dml at master · feihugis/systemml

149 lines (124 loc) · 4.75 KB
#-------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#   http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#-------------------------------------------------------------
# ------------------------------------------------
# ------------------------------------------------
# $R           = #rows
# $C           = #columns
# $NC          = number of categorical attributes
# $MAXDOMAIN   = maximum domain size
# $DATA        = output file path on HDFS
# $SETSIZE     = Size of one bivariate set
# $LABELSETSIZE= Size of second bivariate set with labels
# $TYPES       = output attribute types
# $TYPES1      = Attribute types for Set1
# $TYPES2      = Attribute types for Set2
# $INDEX1      = Indices for Set1
# $INDEX2      = Indices for Set2
# $FMT         = output format
# ------------------------------------------------
# hadoop jar SystemML.jar -f genRandData4DescriptiveStats.dml -nvargs R=1000000 C=1000 NC=50 MAXDOMAIN=1100 DATA=stats/data TYPES=stats/types SETSIZE=15 LABELSETSIZE=10 TYPES1=... TYPES2=... INDEX1=... INDEX2=... FMT=csv
# ------------------------------------------------
FMT = ifdef($FMT, "binary"); # default output format

# Number of categorical attributes; the column layout below requires numC <= $C.
numC = $NC;
if (numC < 0 | numC > $C) {
    stop("NC must be between 0 and C, but got NC=" + numC + " and C=" + $C);
}

# Split the categorical attributes into ordinal (first half, rounded down)
# and nominal (the remainder).
numO = as.integer(numC/2);
numNominal = numC - numO;
print("Categorical Mix = (" + numC + "," + numO + "," + numNominal +")");

# maximum domain size among all categorical attributes
maxDomainSize = $MAXDOMAIN;

# Divide $C attributes according to the following logic:
#
#   1     numO       numC               C
#   |-------|---------|-----------------|
#      ord    nominal    scale
#
#   1 .. numO          : ordinal (type code 3)
#   numO+1 .. numC     : nominal (type code 2)
#   numC+1 .. $C       : scale   (type code 1, the initial fill value)
types = matrix(1, rows=1, cols=$C);

# Each segment is written only when it is non-empty: with NC = 0 or NC = 1
# the ordinal segment has size 0, and a range such as 1:0 (or a matrix with
# zero columns) is not a valid left-indexing target.
if (numO > 0) {
    types[1, 1:numO] = matrix(3, rows=1, cols=numO);
}
if (numNominal > 0) {
    types[1, (numO+1):numC] = matrix(2, rows=1, cols=numNominal);
}
# Generate data
# A holds dense uniform random values; B is the output matrix in which the
# first numC columns are discretized into categorical codes and the remaining
# columns are copied from A unchanged.
A = rand(rows=$R, cols=$C, sparsity=1);
B = matrix(0, rows=nrow(A), cols=ncol(A));

# Skip the loop entirely when there are no categorical attributes, so the
# range 1:numC is never evaluated with numC = 0.
if (numC > 0) {
    parfor (i in 1:numC) {
        Ai = A[,i];

        # draw a random domain size for this attribute
        tmp = round(rand(rows=1, cols=1, min=1, max=maxDomainSize));
        domain = as.scalar(tmp[1,1]);

        # for some attributes, choose the maxDomainSize
        tmp = rand(rows=1, cols=1);
        if (as.scalar(tmp[1,1]) < 0.5) {
            domain = maxDomainSize;
        }

        # map the values of column i onto the integer codes 1..domain
        B[,i] = round(1 + (domain-1)*Ai);
    }
}

# Copy the scale attributes as-is. When numC equals the number of columns
# there are none, and the range (numC+1):ncol(A) would be invalid.
if (numC < ncol(A)) {
    B[, (numC+1):ncol(A)] = A[, (numC+1):ncol(A)];
}

write(B, $DATA, format=FMT);
write(types, $TYPES, format=FMT);
# ----- Generator for Bivariate ---------
# Set1 is laid out as [ordinal | nominal | scale]:
#   settypes1 holds the type code of each selected attribute
#             (3 = ordinal, 2 = nominal, 1 = scale, the initial fill value),
#   index1    holds the column index of each selected attribute, drawn at
#             random from the matching column range of the generated data.
settypes1 = matrix(1, rows=1, cols=$SETSIZE);
index1    = matrix(0, rows=1, cols=$SETSIZE);

catSetSize     = as.integer($SETSIZE/2);     # categorical entries in the set
ordSetSize     = as.integer(catSetSize/2);   # ordinal part of the categorical entries
nominalSetSize = catSetSize - ordSetSize;    # nominal part of the categorical entries
scaleSetSize   = $SETSIZE - catSetSize;      # remaining entries are scale
print("Set Mix = (" + $SETSIZE + "," + catSetSize + "," + ordSetSize + ")" );

# Every segment is filled only when it is non-empty: for small SETSIZE
# (e.g. SETSIZE < 4 gives ordSetSize = 0) a range such as 1:0 and a
# zero-column rand() are not valid.

# select ordinal indices from columns 1 .. numO
if (ordSetSize > 0) {
    settypes1[1, 1:ordSetSize] = matrix(3, rows=1, cols=ordSetSize);
    tmp = rand(rows=1, cols=ordSetSize);
    index1[1, 1:ordSetSize] = round(1 + (numO-1)*tmp);
}

# select nominal indices from columns numO+1 .. numC
if (nominalSetSize > 0) {
    settypes1[1, (ordSetSize+1):catSetSize] = matrix(2, rows=1, cols=nominalSetSize);
    tmp = rand(rows=1, cols=nominalSetSize);
    index1[1, (ordSetSize+1):catSetSize] = round(numO+1 + (numC-numO-1)*tmp);
}

# select scale attributes from columns numC+1 .. $C
if (scaleSetSize > 0) {
    tmp = rand(rows=1, cols=scaleSetSize);
    index1[1, (catSetSize+1):$SETSIZE] = round(numC+1 + ($C-numC-1)*tmp);
}
# --- select types and indices for LABELSET
# settypes2 holds the type code of each selected attribute (initially 2 =
# nominal); index2 holds the column index of each selected attribute.
settypes2 = matrix(2, rows=1, cols=$LABELSETSIZE);
index2    = matrix(0, rows=1, cols=$LABELSETSIZE);

# First entry: a scale attribute (type 1, column in numC+1 .. $C) when the set
# has more than one entry, otherwise a nominal attribute (type 2, column in
# numO+1 .. numC). The two assignments are kept in separate branches so the
# type code and the column range of entry 1 always agree.
if ($LABELSETSIZE > 1) {
    settypes2[1,1] = 1;
    r = as.scalar(rand(rows=1,cols=1));
    index2[1,1] = round(numC+1 + ($C-numC-1)*r);
}
else {
    r = as.scalar(rand(rows=1,cols=1));
    index2[1,1] = round( numO+1 + (numC-numO-1)*r );
}

halfLabelSet = as.integer($LABELSETSIZE/2);

# Entries 2 .. halfLabelSet: ordinal attributes (type 3, column in 1 .. numO).
# Guarded so that a range like 2:1 is never iterated; depending on the engine
# version such a range may count downward and overwrite entry 1.
if (halfLabelSet >= 2) {
    for (i in 2:halfLabelSet) {
        settypes2[1,i] = 3;
        r = as.scalar(rand(rows=1,cols=1));
        index2[1,i] = round( 1 + (numO-1)*r );
    }
}

# Entries halfLabelSet+1 .. $LABELSETSIZE: nominal attributes
# (type 2, column in numO+1 .. numC).
for (i in (halfLabelSet+1):$LABELSETSIZE) {
    settypes2[1,i] = 2;
    r = as.scalar(rand(rows=1,cols=1));
    index2[1,i] = round( numO+1 + (numC-numO-1)*r );
}

write(settypes1, $TYPES1, format=FMT);
write(settypes2, $TYPES2, format=FMT);
write(index1, $INDEX1, format=FMT);
write(index2, $INDEX2, format=FMT);
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

genRandData4DescriptiveStats.dml

Latest commit

History

genRandData4DescriptiveStats.dml

File metadata and controls