1. 程式人生 > Java 生成 Zipf 分佈的資料集(自定義傾斜度,用作 Spark data skew 測試)

Java 生成 Zipf 分佈的資料集(自定義傾斜度,用作 Spark data skew 測試)

1.程式碼

import java.io.Serializable;
import java.util.NavigableMap;
import java.util.Random;
import java.util.TreeMap;

/**
 * Zipf-distributed rank generator backed by a cumulative-probability table.
 *
 * <p>Rank {@code i} (1-based) receives the weight {@code 1 / i^skewness}. The weights are
 * normalised so that they sum to 1 and are then accumulated into {@link #map}: each key is the
 * upper bound of a rank's probability interval and each value is that rank's zero-based index.
 * With a positive skewness the leading ranks own wider intervals than the trailing ones; a
 * skewness of 0 gives every rank the same width (no skew).
 *
 * <p>The random source is seeded with 0, so the sequence produced by {@link #next()} is the same
 * on every run.
 */
public class Zifp_gen implements Serializable {
    private static final long serialVersionUID = 1L;

    /** Numerator of the unnormalised weight {@code CONSTANT / i^skew}. */
    private static final double CONSTANT = 1.0;

    private final Random random = new Random(0);

    /**
     * Cumulative probability (key) mapped to the zero-based rank index (value).
     * Left package-visible because callers in the same package iterate over it directly.
     */
    final NavigableMap<Double, Integer> map;

    /**
     * Builds the cumulative-probability table.
     *
     * @param nums     number of ranks; a value below 1 yields an empty table
     * @param skewness skew exponent; 0 means no skew, larger values mean stronger skew
     */
    public Zifp_gen(int nums, double skewness) {
        map = computeMap(nums, skewness);
    }

    /**
     * Computes the cumulative distribution over {@code size} ranks.
     *
     * @param size number of ranks
     * @param skew skew exponent applied to each rank
     * @return map from cumulative probability to zero-based rank index, in ascending key order
     */
    private static NavigableMap<Double, Integer> computeMap(int size, double skew) {
        NavigableMap<Double, Integer> table = new TreeMap<Double, Integer>();

        // Normalising constant: the sum of every rank's unnormalised weight.
        double div = 0;
        for (int i = 1; i <= size; i++) {
            div += CONSTANT / Math.pow(i, skew);
        }

        // Accumulate the normalised weights; each key closes the interval owned by rank i.
        double sum = 0;
        for (int i = 1; i <= size; i++) {
            double p = (CONSTANT / Math.pow(i, skew)) / div;
            sum += p;
            table.put(sum, i - 1);
        }
        return table;
    }

    /**
     * Draws the next rank.
     *
     * @return a rank in {@code [1, nums]}
     * @throws IllegalStateException if the generator was built with no ranks
     */
    public int next() {
        if (map.isEmpty()) {
            throw new IllegalStateException("cannot sample: generator was built with no ranks");
        }
        double value = random.nextDouble();
        // The smallest cumulative bound that is >= value identifies the rank.
        NavigableMap.Entry<Double, Integer> hit = map.ceilingEntry(value);
        if (hit == null) {
            // Floating-point accumulation can leave the final bound slightly below 1.0, so a
            // draw above it has no ceiling entry; such a draw belongs to the last rank.
            hit = map.lastEntry();
        }
        return hit.getValue() + 1;
    }
}

2.test

import java.util.NavigableMap;
/**
 * Prints the cumulative-probability table of a 100-rank generator built with skewness 1.0,
 * one "Key = ..., Value = ..." line per entry in ascending key order.
 */
public class Test {
    public static void main(String[] args) {
        Zifp_gen generator = new Zifp_gen(100, 1.0);
        // Walk the sorted keys and look each value up, instead of iterating over entries.
        for (Double cumulative : generator.map.keySet()) {
            Integer rankIndex = generator.map.get(cumulative);
            System.out.println("Key = " + cumulative + ", Value = " + rankIndex);
        }
    }
}