小弟是新人,但被要求一定要三天内把这段代码给看懂,求各位前辈帮助!
这好像是个贝叶斯的分类算法,里面的一些具体的语句弄不明白
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;namespace Dianping
{
/// <summary>
/// Counters and derived probabilities for one class of the Bayes classifier.
/// </summary>
class Category
{
    /// <summary>Current value that corresponds to this class.</summary>
    public int type = 0;

    /// <summary>Number of times this class has occurred.</summary>
    public int num = 0;

    /// <summary>Occurrence counts within this class, indexed as [attribute, attribute value].</summary>
    public int[,] item = new int[6, 3];

    /// <summary>Probability of this class.</summary>
    public double probability = 0;

    /// <summary>Probability of each attribute value within this class, indexed like <c>item</c>.</summary>
    public double[,] prob_item = new double[6, 3];
}

/// <summary>
/// State of the Bayes classifier: statistics for the nine classes and for the
/// three current-value levels.
/// </summary>
class Bayes
{
    /// <summary>Nine possible classification results, corresponding to three current-value levels.</summary>
    public Category[] category = CreateCategories(9);

    // Total number of training records.
    int totalRecordNum = 0;

    /// <summary>Statistics for the three current-value levels.</summary>
    // This declaration used to sit inside the trailing comment of the previous
    // field, so the field did not exist as far as the compiler was concerned.
    public Category[] currentValue = CreateCategories(3);

    /// <summary>
    /// Builds an array whose slots all hold a fresh <see cref="Category"/>.
    /// A bare <c>new Category[n]</c> holds only nulls, so the first
    /// <c>category[k].num++</c> would throw a NullReferenceException.
    /// </summary>
    private static Category[] CreateCategories(int count)
    {
        Category[] result = new Category[count];
        for (int i = 0; i < count; i++)
        {
            result[i] = new Category();
        }
        return result;
    }

    // NOTE(review): the methods that follow use these fields but come after this
    // closing brace in the listing; the brace may be the tail of a constructor
    // that was lost when the code was pasted - confirm against the real file.
}

/// <summary>
/// Trains the Bayes classifier on the given training data set: the raw counts
/// are tallied first and then converted into probabilities.
/// </summary>
/// <param name="tRecords">Training data set.</param>
public void TrainBayes(List<string> tRecords)
{
    // Pass 1: count how often each class and each attribute value occurs.
    this.Statistics(tRecords);

    // Pass 2: derive the class and attribute probabilities from those counts.
    this.CalcProb();
}

/// <summary>
/// Tallies, for one training set, how often each class occurs and how often
/// each attribute value occurs within that class. Counts are added on top of
/// whatever the counters already hold.
/// </summary>
/// <param name="tRecords">Records of the selected training sample.</param>
private void Statistics(List<string> tRecords)
{
    // Current-value level of each of the nine classes, where
    // class index = spend level * 3 + visit level.
    int[] valueLevelOfClass = { 0, 0, 0, 0, 1, 1, 0, 1, 2 };

    totalRecordNum = tRecords.Count();

    foreach (string line in tRecords)
    {
        // Field order: gender,age,education,occupation,income,repast_reason,percapita_cons,cons_times
        string[] fields = line.Split(',');

        // Per-capita spend: "100以下" (below 100) -> 0, "100-200" -> 1, anything else (above 200) -> 2.
        int spendLevel = fields[6] == "100以下" ? 0 : (fields[6] == "100-200" ? 1 : 2);

        // Consumption times: "1-2次" -> 0, "2-4次" -> 1, anything else -> 2.
        int visitLevel = fields[7] == "1-2次" ? 0 : (fields[7] == "2-4次" ? 1 : 2);

        int classId = spendLevel * 3 + visitLevel;
        int valueId = valueLevelOfClass[classId];

        category[classId].num++;
        StatisticItemNum(classId, fields);

        currentValue[valueId].num++;
        StatisticItemNum2(valueId, fields);
    }
}

/// <summary>
/// Adds the attribute values of one record to the occurrence counts of the
/// class that the record belongs to.
/// </summary>
/// <param name="cid">Class index into <c>category</c>.</param>
/// <param name="items">Fields of the record.</param>
private void StatisticItemNum(int cid,string[] items)
{
    for (int attr = 0; attr < 6; attr++)
    {
        // MatchItem converts the textual attribute value into a number; that
        // number is also the second-dimension index of the customerProps array.
        Dianping.CustomerProps prop = (Dianping.CustomerProps)(attr + 1);
        int valueIndex = Dianping.MatchItem(prop, items[attr]);
        category[cid].item[attr, valueIndex] += 1;
    }
}

/// <summary>
/// Same tally as <c>StatisticItemNum</c>, but recorded against a current-value
/// level instead of a class.
/// </summary>
/// <param name="cid">Index into <c>currentValue</c>.</param>
/// <param name="items">Fields of the record.</param>
private void StatisticItemNum2(int cid, string[] items)
{
    for (int attr = 0; attr < 6; attr++)
    {
        // MatchItem converts the textual attribute value into a number; that
        // number is also the second-dimension index of the customerProps array.
        Dianping.CustomerProps prop = (Dianping.CustomerProps)(attr + 1);
        int valueIndex = Dianping.MatchItem(prop, items[attr]);
        currentValue[cid].item[attr, valueIndex] += 1;
    }
}

/// <summary>
/// Converts the tallied counts into probabilities: the probability of each
/// class, and the probability of each attribute value within each class.
/// </summary>
/// <remarks>
/// A class with no training records gets probability 0 for itself and for all
/// of its attribute values. Dividing by its zero count would store NaN, and
/// because every comparison against NaN is false, a NaN score for index 0 can
/// never be replaced by the "largest score wins" loop of the classifier.
/// </remarks>
private void CalcProb()
{
    FillProbabilities(category, totalRecordNum);
    FillProbabilities(currentValue, totalRecordNum);
}

/// <summary>
/// Fills <c>probability</c> and <c>prob_item</c> of every group from its counts.
/// </summary>
/// <param name="groups">Groups whose counts have already been tallied.</param>
/// <param name="totalRecords">Total number of training records.</param>
private static void FillProbabilities(Category[] groups, int totalRecords)
{
    foreach (Category group in groups)
    {
        group.probability = totalRecords > 0 ? (double)group.num / totalRecords : 0.0;

        int attributeCount = group.item.GetLength(0);
        int valueCount = group.item.GetLength(1);
        for (int attr = 0; attr < attributeCount; attr++)
        {
            for (int value = 0; value < valueCount; value++)
            {
                // Guard the division: an empty group would otherwise yield 0/0 = NaN.
                group.prob_item[attr, value] = group.num > 0
                    ? (double)group.item[attr, value] / group.num
                    : 0.0;
            }
        }
    }
}
这好像是个贝叶斯的分类算法,里面的一些具体的语句弄不明白
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;namespace Dianping
{
/// <summary>
/// Counters and derived probabilities for one class of the Bayes classifier.
/// </summary>
class Category
{
    /// <summary>Current value that corresponds to this class.</summary>
    public int type = 0;

    /// <summary>Number of times this class has occurred.</summary>
    public int num = 0;

    /// <summary>Occurrence counts within this class, indexed as [attribute, attribute value].</summary>
    public int[,] item = new int[6, 3];

    /// <summary>Probability of this class.</summary>
    public double probability = 0;

    /// <summary>Probability of each attribute value within this class, indexed like <c>item</c>.</summary>
    public double[,] prob_item = new double[6, 3];
}

/// <summary>
/// State of the Bayes classifier: statistics for the nine classes and for the
/// three current-value levels.
/// </summary>
class Bayes
{
    /// <summary>Nine possible classification results, corresponding to three current-value levels.</summary>
    public Category[] category = CreateCategories(9);

    // Total number of training records.
    int totalRecordNum = 0;

    /// <summary>Statistics for the three current-value levels.</summary>
    // This declaration used to sit inside the trailing comment of the previous
    // field, so the field did not exist as far as the compiler was concerned.
    public Category[] currentValue = CreateCategories(3);

    /// <summary>
    /// Builds an array whose slots all hold a fresh <see cref="Category"/>.
    /// A bare <c>new Category[n]</c> holds only nulls, so the first
    /// <c>category[k].num++</c> would throw a NullReferenceException.
    /// </summary>
    private static Category[] CreateCategories(int count)
    {
        Category[] result = new Category[count];
        for (int i = 0; i < count; i++)
        {
            result[i] = new Category();
        }
        return result;
    }

    // NOTE(review): the methods that follow use these fields but come after this
    // closing brace in the listing; the brace may be the tail of a constructor
    // that was lost when the code was pasted - confirm against the real file.
}

/// <summary>
/// Trains the Bayes classifier on the given training data set: the raw counts
/// are tallied first and then converted into probabilities.
/// </summary>
/// <param name="tRecords">Training data set.</param>
public void TrainBayes(List<string> tRecords)
{
    // Pass 1: count how often each class and each attribute value occurs.
    this.Statistics(tRecords);

    // Pass 2: derive the class and attribute probabilities from those counts.
    this.CalcProb();
}

/// <summary>
/// Tallies, for one training set, how often each class occurs and how often
/// each attribute value occurs within that class. Counts are added on top of
/// whatever the counters already hold.
/// </summary>
/// <param name="tRecords">Records of the selected training sample.</param>
private void Statistics(List<string> tRecords)
{
    // Current-value level of each of the nine classes, where
    // class index = spend level * 3 + visit level.
    int[] valueLevelOfClass = { 0, 0, 0, 0, 1, 1, 0, 1, 2 };

    totalRecordNum = tRecords.Count();

    foreach (string line in tRecords)
    {
        // Field order: gender,age,education,occupation,income,repast_reason,percapita_cons,cons_times
        string[] fields = line.Split(',');

        // Per-capita spend: "100以下" (below 100) -> 0, "100-200" -> 1, anything else (above 200) -> 2.
        int spendLevel = fields[6] == "100以下" ? 0 : (fields[6] == "100-200" ? 1 : 2);

        // Consumption times: "1-2次" -> 0, "2-4次" -> 1, anything else -> 2.
        int visitLevel = fields[7] == "1-2次" ? 0 : (fields[7] == "2-4次" ? 1 : 2);

        int classId = spendLevel * 3 + visitLevel;
        int valueId = valueLevelOfClass[classId];

        category[classId].num++;
        StatisticItemNum(classId, fields);

        currentValue[valueId].num++;
        StatisticItemNum2(valueId, fields);
    }
}

/// <summary>
/// Adds the attribute values of one record to the occurrence counts of the
/// class that the record belongs to.
/// </summary>
/// <param name="cid">Class index into <c>category</c>.</param>
/// <param name="items">Fields of the record.</param>
private void StatisticItemNum(int cid,string[] items)
{
    for (int attr = 0; attr < 6; attr++)
    {
        // MatchItem converts the textual attribute value into a number; that
        // number is also the second-dimension index of the customerProps array.
        Dianping.CustomerProps prop = (Dianping.CustomerProps)(attr + 1);
        int valueIndex = Dianping.MatchItem(prop, items[attr]);
        category[cid].item[attr, valueIndex] += 1;
    }
}

/// <summary>
/// Same tally as <c>StatisticItemNum</c>, but recorded against a current-value
/// level instead of a class.
/// </summary>
/// <param name="cid">Index into <c>currentValue</c>.</param>
/// <param name="items">Fields of the record.</param>
private void StatisticItemNum2(int cid, string[] items)
{
    for (int attr = 0; attr < 6; attr++)
    {
        // MatchItem converts the textual attribute value into a number; that
        // number is also the second-dimension index of the customerProps array.
        Dianping.CustomerProps prop = (Dianping.CustomerProps)(attr + 1);
        int valueIndex = Dianping.MatchItem(prop, items[attr]);
        currentValue[cid].item[attr, valueIndex] += 1;
    }
}

/// <summary>
/// Converts the tallied counts into probabilities: the probability of each
/// class, and the probability of each attribute value within each class.
/// </summary>
/// <remarks>
/// A class with no training records gets probability 0 for itself and for all
/// of its attribute values. Dividing by its zero count would store NaN, and
/// because every comparison against NaN is false, a NaN score for index 0 can
/// never be replaced by the "largest score wins" loop of the classifier.
/// </remarks>
private void CalcProb()
{
    FillProbabilities(category, totalRecordNum);
    FillProbabilities(currentValue, totalRecordNum);
}

/// <summary>
/// Fills <c>probability</c> and <c>prob_item</c> of every group from its counts.
/// </summary>
/// <param name="groups">Groups whose counts have already been tallied.</param>
/// <param name="totalRecords">Total number of training records.</param>
private static void FillProbabilities(Category[] groups, int totalRecords)
{
    foreach (Category group in groups)
    {
        group.probability = totalRecords > 0 ? (double)group.num / totalRecords : 0.0;

        int attributeCount = group.item.GetLength(0);
        int valueCount = group.item.GetLength(1);
        for (int attr = 0; attr < attributeCount; attr++)
        {
            for (int value = 0; value < valueCount; value++)
            {
                // Guard the division: an empty group would otherwise yield 0/0 = NaN.
                group.prob_item[attr, value] = group.num > 0
                    ? (double)group.item[attr, value] / group.num
                    : 0.0;
            }
        }
    }
}
/// <summary>
/// Classifies one attribute record into one of the nine classes.
/// </summary>
/// <param name="record">Attribute values of the record; only the first six are used.</param>
/// <returns>Index (0-8) of the class with the largest posterior score.</returns>
public int Classify(string[] record)
{
    double[] pro_class = new double[9]; // score proportional to P(class[i]|record)
    int max_class = 0;                  // index of the class with the largest score so far

    // This loop header had been swallowed by the trailing comment of the
    // declaration above, leaving the loop body without its loop.
    for (int i = 0; i < 9; i++)
    {
        pro_class[i] = 1;

        // P(class[i]|record) = P(record|class[i]) P(class[i]) / P(record)
        //                    = P(item[0]|class[i]) * ... * P(item[5]|class[i]) * P(class[i]) / P(record)
        // P(record) is the same for every class, so comparing the numerators is enough.
        //
        // Statistics exist for six attributes only (prob_item is 6x3), so stop
        // at six even when the record carries more fields - same bound as Classify2.
        for (int j = 0; j < record.Length && j < 6; j++)
        {
            int index = Dianping.MatchItem((Dianping.CustomerProps)(j + 1), record[j]);
            pro_class[i] *= category[i].prob_item[j, index];
        }
        pro_class[i] *= category[i].probability;

        // Keep the largest score. A NaN best-so-far must give way to any real
        // score, because "x > NaN" is always false and would pin the result to it.
        bool replacesNaN = double.IsNaN(pro_class[max_class]) && !double.IsNaN(pro_class[i]);
        if (replacesNaN || pro_class[i] > pro_class[max_class])
        {
            max_class = i;
        }
    }
    return max_class;
}

// Classify directly by current value: high, medium, low.
/// <summary>
/// Classifies one attribute record directly into one of the three
/// current-value levels.
/// </summary>
/// <param name="record">Attribute values of the record; only the first six are used.</param>
/// <returns>Index (0-2) of the level with the largest posterior score.</returns>
public int Classify2(string[] record)
{
    double[] pro = new double[3]; // score proportional to P(level[i]|record)
    int max = 0;                  // index of the level with the largest score so far

    for (int i = 0; i < 3; i++)
    {
        pro[i] = 1;

        // P(level[i]|record) = P(record|level[i]) P(level[i]) / P(record)
        // P(record) is the same for every level, so comparing the numerators is enough.
        for (int j = 0; j < record.Length && j < 6; j++)
        {
            int index = Dianping.MatchItem((Dianping.CustomerProps)(j + 1), record[j]);
            pro[i] *= currentValue[i].prob_item[j, index];
        }
        pro[i] *= currentValue[i].probability;

        // Keep the largest score. A NaN best-so-far must give way to any real
        // score, because "x > NaN" is always false and would pin the result to it.
        bool replacesNaN = double.IsNaN(pro[max]) && !double.IsNaN(pro[i]);
        if (replacesNaN || pro[i] > pro[max])
        {
            max = i;
        }
    }
    return max;
}

/// <summary>
/// Decodes a class index into the current-value result: the per-capita spend
/// level and the monthly consumption-times level.
/// </summary>
/// <param name="max_class">Class index; 0-8 are decoded, any other value yields {0, 0}.</param>
/// <returns>Two-element array: [0] = class index / 3, [1] = class index % 3.</returns>
public int[] Consumption(int max_class)
{
    int[] levels = new int[2];

    bool isKnownClass = max_class >= 0 && max_class <= 8;
    if (isKnownClass)
    {
        levels[0] = max_class / 3;
        levels[1] = max_class % 3;
    }

    return levels;
}

#region 以下用于测试
/// <summary>
/// Runs <c>Classify2</c> over the given records, compares each result with the
/// expected label read line by line from result2.txt, and writes every
/// "expected actual" pair followed by the overall accuracy to test2.txt.
/// </summary>
/// <param name="tRecords">Comma-separated records to classify.</param>
private void test(List<string> tRecords)
{
    int correct = 0;
    int error = 0;

    // The using blocks close all four streams even when classification or I/O
    // throws part-way through; disposing the writer also flushes it.
    // NOTE(review): FileMode.OpenOrCreate does not truncate, so a longer test2.txt
    // from an earlier run keeps its old tail - FileMode.Create may be intended.
    using (FileStream fs1 = new FileStream(@"C:\Users\wucs32\Documents\Visual Studio 2008\Projects\Dianping\Dianping\bin\Debug\App_Data\test2.txt", FileMode.OpenOrCreate, FileAccess.Write))
    using (StreamWriter sw = new StreamWriter(fs1, Encoding.Unicode))
    using (FileStream fs = new FileStream(@"C:\Users\wucs32\Documents\Visual Studio 2008\Projects\Dianping\Dianping\bin\Debug\App_Data\result2.txt", FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
    using (StreamReader sr = new StreamReader(fs, Encoding.Unicode))
    {
        for (int i = 0; i < tRecords.Count; i++)
        {
            string[] record = tRecords[i].Split(',');
            int result = Classify2(record);

            // ReadLine returns null once result2.txt runs out of lines; such a
            // record is counted as an error, exactly as a wrong label would be.
            string s = sr.ReadLine();
            if (s == result.ToString())
            {
                correct++;
            }
            else
            {
                error++;
            }
            sw.WriteLine(s + " " + result.ToString());
        }

        // Accuracy over all records (NaN when tRecords is empty).
        sw.WriteLine((double)correct / (correct + error));
    }
}
#endregion
}
StatisticItemNum2(0, tItems); int index = Dianping.MatchItem((Dianping.CustomerProps)(j + 1), record[j]);
这些语句是什么意思?
eg:public Category[] category = new Category[9];
Category 在最上面好像是个 class 吧,那为什么这里写成 new Category[9]?category 这个变量到底是数组还是类?
eg:StatisticItemNum2(0, tItems);括号里面这两个参数是什么意思
eg:int index = Dianping.MatchItem((Dianping.CustomerProps)(j + 1), record[j]); 这个函数是做什么用的?为什么参数是 ((Dianping.CustomerProps)(j + 1), record[j])?新人问题很多,希望大家能帮帮忙。
/// <summary>
/// Converts the textual value of a customer property into its numeric index.
/// </summary>
/// <param name="props">Property to look up; its numeric value minus one selects the row of <c>customerProps</c>.</param>
/// <param name="item">Textual value to search for among the three entries of that row.</param>
/// <returns>Column (0-2) of the first entry equal to <paramref name="item"/>, or 0 when none matches.</returns>
static public int MatchItem(CustomerProps props, string item)
{
    int row = (int)props - 1;

    int column = 0;
    while (column < 3)
    {
        if (customerProps[row, column] == item)
        {
            return column;
        }
        column++;
    }

    // No entry matched: the result is 0, the same value as a match on the first entry.
    return 0;
}
// One customer record; every attribute is stored as an integer code.
public class Customer
{
public int id; // ID number
public int gender; // Gender
public int age; // Age
public int education; // Education level
public int occupation; // Occupation
public int income; // Monthly income
public int repast_reason; // Reason for dining out
public int percapita_cons; // Per-capita spend
public int cons_times; // Consumption times per month
public int firstchoice; // Whether the customer treats Dianping as their first choice
public int rec_restaurant; // Whether the customer often dines at partner restaurants recommended by Dianping
public int intro_friend; // Whether the customer has introduced Dianping to people around them
public int rec_member; // Whether the customer has successfully referred someone to become a Dianping member
public int usefulness; // Whether the customer finds the restaurant information and reviews useful
public int rationality; // Whether the customer finds the site's points system and activities reasonable
public int comment; // Whether the customer often submits reviews or updates information
public int participation; // Whether the customer is keen to take part in the site's activities
public int cons_amount; // Cumulative spend over the last six months
public int current_value; // Current value of the customer
public int history_value; // Historical value of the customer
public int potential_value; // Potential value of the customer
public int value; // Value classification of the customer
}
/// <summary>
/// Identifiers of the customer properties. <c>MatchItem</c> uses the numeric
/// value minus one as the row index into <c>customerProps</c>.
/// </summary>
public enum CustomerProps
{
    // Customer attributes, numbered 1-17.
    GENDER         = 1,
    AGE            = 2,
    EDUCATION      = 3,
    OCCUPATION     = 4,
    INCOME         = 5,
    REPAST_REASON  = 6,
    PERCAPITA_CONS = 7,
    CONS_TIMES     = 8,
    FIRSTCHOICE    = 9,
    REC_RESTAURANT = 10,
    INTRO_FRIEND   = 11,
    REC_MEMBER     = 12,
    USEFULNESS     = 13,
    RATIONALITY    = 14,
    COMMENT        = 15,
    PARTICIPATION  = 16,
    CONS_AMOUNT    = 17,

    // Value properties, numbered from 101.
    CURRENT_VALUE   = 101,
    POTENTIAL_VALUE = 102,
    HISTORY_VALUE   = 103,
    VALUE           = 104
}