依存树解析的JAVA接口
[b]摘要:[/b] 本文通过代码形式将调用依存树解析的JAVA接口呈现给读者。有了这些代码和哈工大的共享资源库,就可以编译生成完整的JAVA可调用的DLL文件。这篇文章没有讲解基本的JNI技术,对JNI不熟悉的读者请参考相关文章。
[b]本文JNI cpp代码的两大看点:[/b]
1、解决中文在使用JNI技术出现乱码的问题。
2、在C++端直接生成JAVA的ArrayList对象。
[b]正文:[/b]
近日使用哈工大的句法分析模块(依存树解析)做一些小实验。为了方便JAVA调用,利用JNI技术写了一个java调用接口。其他如分词等等的做法也可利用本文做参考。
[b]注:[/b]
VC工程 需要哈工大的程序库相关DLL,头文件(__ltp_dll.h)和CPP文件(__ltp_dll_x.cpp)的支持。
使用的哈工大程序库版本为1.3.2
XML函数库使用的是jdom(版本号未知,需要的读者可以给我发EMAIL索取)。
[b]1、 JAVA类:DependencyTreeNode,表示依存树上的一个节点[/b]
package hit_irlab.jni;
// A single node (one word) of a dependency tree.
// The fields are public and keep their exact names because the native code
// in this project resolves them by name when it fills in a node.
public class DependencyTreeNode {
    // Id of the word within the whole document (intended to be 0-based; assigned by the native code).
    public int idInDocument = 0;
    // Id of the word within its paragraph (intended to be 0-based; assigned by the native code).
    public int idInParagraph = 0;
    // Id of the word within its sentence, starting from 0.
    public int idInSentence = 0;
    // Text of the word.
    public String content = null;
    // Part-of-speech tag of the word.
    public String pos = null;
    // idInSentence of the node that points at this word;
    // -2 means no word points at it, -1 denotes the end-of-sentence marker (EOS).
    public int parent = -2;
    // Dependency relation between this word and its parent.
    public String relate = null;

    // Renders the node as "content/pos,parent=<parent>,relate=<relate>".
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append(content).append("/").append(pos);
        text.append(",parent=").append(parent);
        text.append(",relate=").append(relate);
        return text.toString();
    }
}
[b]2、 JAVA类:DependencyTree:用于访问依存树解析的JAVA类[/b]
package hit_irlab.jni;
import java.io.*;
import java.util.*;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
// Represents the dependency trees of one parsed document.
// Parsing itself is done by the native library "dependencyTreeJni"; this class
// holds the result and offers flattened views plus an XML export.
public class DependencyTree {
    /*
     * Every dependency-tree node of the document, nested three levels deep:
     *   document  : ArrayList of paragraphs
     *   paragraph : ArrayList of sentences
     *   sentence  : ArrayList of DependencyTreeNode
     * i.e. conceptually ArrayList<ArrayList<ArrayList<DependencyTreeNode>>>.
     * Raw ArrayList is kept because the native methods return raw lists.
     */
    private ArrayList documentTreeNodeList = null;

    static {
        System.loadLibrary("dependencyTreeJni");
    }

    /**
     * Builds the tree by parsing a text file.
     * @param txtFile the text file to parse; its absolute path is handed to the native parser
     */
    public DependencyTree(File txtFile) {
        documentTreeNodeList = emptyIfNull(
                getDocumentTreeNodesFromTxtFile(txtFile.getAbsolutePath()));
    }

    /**
     * Builds the tree by parsing a string.
     * @param str the text to parse
     */
    public DependencyTree(String str) {
        documentTreeNodeList = emptyIfNull(getDocumentTreeNodesFromString(str));
    }

    // The native side can hand back null (it does so for an unknown source type);
    // an empty list keeps the accessors below from throwing NullPointerException.
    private static ArrayList emptyIfNull(ArrayList list) {
        return list == null ? new ArrayList() : list;
    }

    // JDOM is expected to reject null attribute values, so nulls are written as "".
    private static String nullToEmpty(String value) {
        return value == null ? "" : value;
    }

    /**
     * Returns the list of paragraphs; each element is a list of sentences.
     * This is the internal list itself, not a copy.
     */
    public ArrayList getParagraphList() {
        return documentTreeNodeList;
    }

    /**
     * Returns all sentences of the document in order; each element is a list
     * of DependencyTreeNode.
     */
    public ArrayList getSentenceList() {
        ArrayList ret = new ArrayList();
        // Walk paragraph by paragraph and concatenate their sentences.
        for (int i = 0; i < documentTreeNodeList.size(); i++) {
            ArrayList paragraphList = (ArrayList) documentTreeNodeList.get(i);
            ret.addAll(paragraphList);
        }
        return ret;
    }

    /**
     * Returns all DependencyTreeNode objects of the document in order.
     */
    public ArrayList getTreeNodeList() {
        ArrayList ret = new ArrayList();
        // Walk paragraph by paragraph, then sentence by sentence.
        for (int i = 0; i < documentTreeNodeList.size(); i++) {
            ArrayList paragraphList = (ArrayList) documentTreeNodeList.get(i);
            for (int j = 0; j < paragraphList.size(); j++) {
                ArrayList sentenceList = (ArrayList) paragraphList.get(j);
                ret.addAll(sentenceList);
            }
        }
        return ret;
    }

    // Native call: parses the text file at filePath and returns the nested node lists.
    private native ArrayList getDocumentTreeNodesFromTxtFile(String filePath);

    // Native call: parses the given string and returns the nested node lists.
    private native ArrayList getDocumentTreeNodesFromString(String str);

    /**
     * Saves the parse result as an XML file with the layout
     * xml4nlp / doc / para / sent / word.
     * @param xmlFile the file to write
     * @throws IOException if the file cannot be written
     */
    public void saveToXml(File xmlFile) throws IOException {
        Element newroot = new Element("xml4nlp");
        Document newdocument = new Document(newroot);

        // The "note" element flags which analysis layers are present in the file.
        Element noteElement = new Element("note");
        noteElement.setAttribute("sent", "y");
        noteElement.setAttribute("word", "y");
        noteElement.setAttribute("pos", "y");
        noteElement.setAttribute("ne", "n");
        noteElement.setAttribute("parser", "y");
        noteElement.setAttribute("wsd", "n");
        noteElement.setAttribute("srl", "n");
        noteElement.setAttribute("class", "n");
        noteElement.setAttribute("sum", "n");
        noteElement.setAttribute("cr", "n");
        newroot.addContent(noteElement);

        Element newdoc = new Element("doc");
        newroot.addContent(newdoc);

        // One <para> per paragraph.
        for (int i = 0; i < documentTreeNodeList.size(); i++) {
            ArrayList paragraphList = (ArrayList) documentTreeNodeList.get(i);
            Element paraElement = new Element("para");
            paraElement.setAttribute("id", "" + i);
            newdoc.addContent(paraElement);

            // One <sent> per sentence.
            for (int j = 0; j < paragraphList.size(); j++) {
                ArrayList sentenceList = (ArrayList) paragraphList.get(j);
                Element sentElement = new Element("sent");
                sentElement.setAttribute("id", "" + j);
                paraElement.addContent(sentElement);

                // Accumulates the sentence text from its words. The original code
                // declared "sentC" but appended to an undeclared "sentContent",
                // which did not compile.
                StringBuilder sentContent = new StringBuilder();
                for (int k = 0; k < sentenceList.size(); k++) {
                    DependencyTreeNode node = (DependencyTreeNode) sentenceList.get(k);
                    Element word = new Element("word");
                    word.setAttribute("id", "" + k);
                    word.setAttribute("cont", nullToEmpty(node.content));
                    word.setAttribute("pos", nullToEmpty(node.pos));
                    word.setAttribute("parent", "" + node.parent);
                    word.setAttribute("relate", nullToEmpty(node.relate));
                    sentElement.addContent(word);
                    sentContent.append(nullToEmpty(node.content));
                }
                sentElement.setAttribute("cont", sentContent.toString());
            }
        }

        // Write the document out with the stylesheet reference and declared encoding.
        tool.XmlTool.OutputDocToFile(newdocument, "nlp_style.xsl", "gb2312", xmlFile.getAbsolutePath());
    }

    /**
     * Demonstrates how to use this class: parses a sentence and saves the
     * result to test_parse_en.xml.
     * @throws IOException if the XML file cannot be written
     */
    public static void usage() throws IOException {
        // A tree can be built either from a string or from a text file.
        DependencyTree tree = new DependencyTree("The board forced him to resign.");
        // From a text file instead:
        // DependencyTree tree = new DependencyTree(new File("test.txt"));
        //
        // Walking the nodes:
        // ArrayList nodeList = tree.getTreeNodeList();
        // for (int i = 0; i < nodeList.size(); i++) {
        //     DependencyTreeNode node = (DependencyTreeNode) nodeList.get(i);
        //     System.out.print("content: " + node.content + "\t");
        //     System.out.print("pos tag: " + node.pos + "\t");
        //     System.out.print("id in sentence: " + node.idInSentence + "\t");
        //     System.out.print("id in sentence of the parent node: " + node.parent + "\t");
        //     System.out.println("relation between the two nodes: " + node.relate);
        // }
        tree.saveToXml(new File("test_parse_en.xml"));
    }

    /**
     * Runs the usage demo and prints the elapsed time in milliseconds.
     * @param args unused
     * @throws IOException if the demo cannot write its XML file
     */
    public static void main(String[] args) throws IOException {
        long start = System.currentTimeMillis();
        usage();
        long end = System.currentTimeMillis();
        System.out.println("use time:" + (end - start) + " :ms");
        // Other things to try:
        // DependencyTree tree = new DependencyTree(new File("test.txt"));
        // System.out.println(tree.getTreeNodeList());
        // System.out.println(tree.getSentenceList());
        // System.out.println(tree.getParagraphList());
    }
}
[b]3、 JAVA类:XmlTool:用于输出XML文件的类[/b]
package tool;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import org.jdom.Document;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
// Helper for writing a JDOM document to an XML file.
public class XmlTool {
    /**
     * Writes the root element of {@code doc} to {@code filePath}, preceded by an
     * XML declaration and, optionally, an xml-stylesheet processing instruction.
     *
     * The file is encoded in {@code encoding} whenever that encoding is also
     * written into the XML declaration, so the declared and the actual encoding
     * agree. (The previous implementation always wrote in the platform default
     * charset, whatever the declaration said.)
     *
     * @param doc      the document whose root element is written
     * @param xslpath  stylesheet href; ignored when null or not longer than 3 characters
     * @param encoding charset name; ignored when null or not longer than 1 character,
     *                 in which case the platform default charset is used
     * @param filePath path of the file to create or overwrite
     * @throws IOException if the file cannot be written or the encoding is not supported
     */
    public static void OutputDocToFile(Document doc, String xslpath,
            String encoding, String filePath) throws IOException {
        boolean hasEncoding = encoding != null && encoding.length() > 1;
        boolean hasStylesheet = xslpath != null && xslpath.length() > 3;

        Format format = Format.getPrettyFormat();
        format.setEncoding(encoding);
        XMLOutputter outputter = new XMLOutputter(format);

        OutputStream stream = new FileOutputStream(filePath);
        try {
            Writer out = hasEncoding
                    ? new OutputStreamWriter(stream, encoding)
                    : new OutputStreamWriter(stream);

            // e.g. <?xml version="1.0" encoding="gb2312" ?>
            out.write("<?xml version=\"1.0\"");
            if (hasEncoding) {
                out.write(" encoding=\"" + encoding + "\"");
            }
            out.write("?>");
            out.write("\r\n");
            if (hasStylesheet) {
                out.write("<?xml-stylesheet type=\"text/xsl\" href=\"" + xslpath + "\"?>");
                out.write("\r\n");
            }
            outputter.output(doc.getRootElement(), out);
            // Push buffered characters to the stream before it is closed below.
            out.flush();
        } finally {
            // Always release the file handle, even when writing fails.
            stream.close();
        }
    }
}
[b]4、 JNI的C/C++头文件:hit_irlab_jni_DependencyTree.h[/b]
/* DO NOT EDIT THIS FILE - it is machine generated */
#include <jni.h>
/* Header for class hit_irlab_jni_DependencyTree */
#ifndef _Included_hit_irlab_jni_DependencyTree
#define _Included_hit_irlab_jni_DependencyTree
#ifdef __cplusplus
extern "C" {
#endif
/*
* Class: hit_irlab_jni_DependencyTree
* Method: getDocumentTreeNodesFromTxtFile
* Signature: (Ljava/lang/String;)Ljava/util/ArrayList;
*/
JNIEXPORT jobject JNICALL Java_hit_1irlab_jni_DependencyTree_getDocumentTreeNodesFromTxtFile
(JNIEnv *, jobject, jstring);
/*
* Class: hit_irlab_jni_DependencyTree
* Method: getDocumentTreeNodesFromString
* Signature: (Ljava/lang/String;)Ljava/util/ArrayList;
*/
JNIEXPORT jobject JNICALL Java_hit_1irlab_jni_DependencyTree_getDocumentTreeNodesFromString
(JNIEnv *, jobject, jstring);
#ifdef __cplusplus
}
#endif
#endif
[b]5、 实现JNI 的DLL的核心CPP文件:dependencyTree.cpp[/b]
#include "__ltp_dll.h"
#pragma comment(lib, "__ltp_dll.lib")
#pragma warning(disable : 4786)
#include "hit_irlab_jni_DependencyTree.h"
#include <windows.h>
#include <algorithm>
#include <iterator>
#include <vector>
#include <string>
#include <iostream>
#include <utility>
#include <map>
#include <fstream>
#include <conio.h>
#include <ctime>
using namespace std;
using namespace HIT_IR_LTP; // Important!
// Character conversion helper that avoids garbled Chinese text.
// Converts a NUL-terminated string in the Windows ANSI code page (CP_ACP)
// into a new Java String.
//   env : JNI environment of the calling thread
//   str : ANSI string to convert; may be NULL
// Returns a local reference to the new String, or NULL when str is NULL,
// memory cannot be allocated, or the conversion fails.
jstring WindowsTojstring( JNIEnv* env, const char* str )
{
    if( str == NULL )
        return NULL;                       // strlen(NULL) would crash
    int slen = (int)strlen( str );
    if( slen == 0 )
        return (env)->NewStringUTF( str ); // empty input -> empty Java string

    // First call only asks how many UTF-16 units are needed.
    int length = MultiByteToWideChar( CP_ACP, 0, (LPCSTR)str, slen, NULL, 0 );
    if( length <= 0 )
        return NULL;

    unsigned short* buffer = (unsigned short *)malloc( length * sizeof(unsigned short) );
    if( buffer == NULL )
        return NULL;

    jstring rtn = 0;
    if( MultiByteToWideChar( CP_ACP, 0, (LPCSTR)str, slen, (LPWSTR)buffer, length ) > 0 )
        rtn = (env)->NewString( (jchar*)buffer, length );
    free( buffer );
    return rtn;
}
// Converts a Java String into a NUL-terminated string in the Windows ANSI
// code page (CP_ACP).
//   env  : JNI environment of the calling thread
//   jstr : Java string to convert; may be NULL
// Returns a buffer allocated with malloc that the caller must release with
// free(), or NULL when jstr is NULL, memory cannot be allocated, or the
// conversion fails. An empty Java string yields an empty C string.
char* jstringToWindows( JNIEnv *env, jstring jstr )
{
    if( jstr == NULL )
        return NULL;
    int length = (env)->GetStringLength( jstr );
    const jchar* jcstr = (env)->GetStringChars( jstr, 0 );
    if( jcstr == NULL )
        return NULL;

    char* rtn = NULL;
    if( length == 0 )
    {
        // WideCharToMultiByte rejects a zero-length input, so build "" directly.
        rtn = (char*)malloc( 1 );
        if( rtn != NULL )
            rtn[0] = 0;
    }
    else
    {
        // Ask for the exact byte count instead of assuming 2 bytes per character,
        // which is not enough for every ANSI code page.
        int needed = WideCharToMultiByte( CP_ACP, 0, (LPCWSTR)jcstr, length, NULL, 0, NULL, NULL );
        if( needed > 0 )
            rtn = (char*)malloc( needed + 1 );
        if( rtn != NULL )
        {
            int size = WideCharToMultiByte( CP_ACP, 0, (LPCWSTR)jcstr, length, rtn, needed, NULL, NULL );
            if( size > 0 )
                rtn[size] = 0;
            else
            {
                free( rtn );
                rtn = NULL;
            }
        }
    }
    // Released on every path; the original skipped this (and leaked the buffer)
    // when the conversion failed.
    (env)->ReleaseStringChars( jstr, jcstr );
    return rtn;
}
// 构建依存树节点的声明
jobject constructDependencyTreeNode(JNIEnv *env,int idInDocument,int idInParagraph,
int idInSentence,const char *content,const char *pos,int parent,const char *relate );
//
jobject getDocumentTreeNodes
(JNIEnv *env, jobject obj, jstring instr, int sourceType );
/*
 * JNI entry point: parse a text file.
 * Class: hit_irlab_jni_DependencyTree
 * Method: getDocumentTreeNodesFromTxtFile
 * Signature: (Ljava/lang/String;)Ljava/util/ArrayList;
 * Forwards to getDocumentTreeNodes with source type 1 (text file).
 */
JNIEXPORT jobject JNICALL Java_hit_1irlab_jni_DependencyTree_getDocumentTreeNodesFromTxtFile
(JNIEnv *env, jobject obj, jstring txtFileName){
    const int sourceTypeTxtFile = 1;
    jobject documentNodes = getDocumentTreeNodes( env, obj, txtFileName, sourceTypeTxtFile );
    return documentNodes;
}
/*
* 构建依存树的核心函数
* int sourceType : 1: txt 文本文件; 2: 需要进行分析的字符串
*/
jobject getDocumentTreeNodes
(JNIEnv *env, jobject obj, jstring instr, int sourceType ){
const char * c_instr = jstringToWindows( env, instr );
env->ReleaseStringUTFChars( instr, c_instr );//释放传入的参数
// -------------------------
if ( sourceType == 1 ){
CreateDOMFromTxt( c_instr );
} else if ( sourceType == 2 ){
CreateDOMFromString( c_instr );
} else {
printf("wrong sourceType!\n");
return NULL;
}
Parser();
// 构建一个document arraylist
jclass class_ArrayList=env->FindClass("java/util/ArrayList");/* 获得Java类 */
jmethodID construct=env->GetMethodID( class_ArrayList, "<init>","()V");/* 获得构造方法 */
/* 获得List的add方法 */
jmethodID list_add=env->GetMethodID(class_ArrayList,"add","(Ljava/lang/Object;)Z");
jobject documentArrayList = env->NewObject( class_ArrayList, construct, "");/* 创建java对象 */
int idInDocument = 0;
int paraNum = CountParagraphInDocument();//按照文档中的段落进行循环
for (int k=0; k < paraNum; ++k)
{
// 构建一个段落 ArrayList
jobject paragraphArrayList = env->NewObject( class_ArrayList, construct, "");
env->CallObjectMethod(documentArrayList,list_add,paragraphArrayList); // 将段落添加到 document 上
int idInParagraph = 0;
int sentNum = CountSentenceInParagraph(k);
for (int j=0; j < sentNum; ++j)
{
// 构建一个句子 ArrayList
jobject sentenceArrayList = env->NewObject( class_ArrayList, construct, "");
env->CallObjectMethod(paragraphArrayList,list_add,sentenceArrayList); // 将句子添加到 段落 上
int wordNum = CountWordInSentence(k, j);
for (int i=0; i < wordNum; ++i)
{
pair<int, const char *> parent_relate;
int ret = GetParse(parent_relate, k, j, i);
if (0 == ret && parent_relate.second != NULL)
{
idInDocument++;
idInParagraph++;
int idInSentence = i;
const char *content = GetWord(k,j,i);
const char *pos = GetPOS(k,j,i);
int parent = parent_relate.first;
const char *relate = parent_relate.second;
jobject oneNode = constructDependencyTreeNode(env,idInDocument,idInParagraph,idInSentence,
content,pos,parent,relate);
/* 调用List 的add方法 */
env->CallObjectMethod(sentenceArrayList,list_add,oneNode);
}
}
}
}
return documentArrayList;
}
/*
 * JNI entry point: parse a string.
 * Class: hit_irlab_jni_DependencyTree
 * Method: getDocumentTreeNodesFromString
 * Signature: (Ljava/lang/String;)Ljava/util/ArrayList;
 * Forwards to getDocumentTreeNodes with source type 2 (string to analyse).
 */
JNIEXPORT jobject JNICALL Java_hit_1irlab_jni_DependencyTree_getDocumentTreeNodesFromString
(JNIEnv *env, jobject obj, jstring instr){
    const int sourceTypeString = 2;
    jobject documentNodes = getDocumentTreeNodes( env, obj, instr, sourceTypeString );
    return documentNodes;
}
/*
* 构造treeNode对象
*/
jobject constructDependencyTreeNode(JNIEnv *env,int idInDocument,int idInParagraph,
int idInSentence,const char *content,const char *pos,int parent,const char *relate ){
/**************创建DependencyTreeNode对象 start*****************/
jclass class_treeNode=env->FindClass("hit_irlab/jni/DependencyTreeNode");/* 获得Java类 */
jmethodID construct_treeNode=env->GetMethodID( class_treeNode, "<init>","()V");/* 获得构造方法 */
jobject obj_treeNode =env->NewObject( class_treeNode, construct_treeNode, "");/* 创建java对象 */
/**************创建属性ID***************************/
jfieldID jcontent = env->GetFieldID(class_treeNode,"content","Ljava/lang/String;");
jfieldID jpos = env->GetFieldID(class_treeNode,"pos","Ljava/lang/String;");
jfieldID jrelate = env->GetFieldID(class_treeNode,"relate","Ljava/lang/String;");
jfieldID jidInDocument = env->GetFieldID(class_treeNode,"idInDocument","I");
jfieldID jidInParagraph = env->GetFieldID(class_treeNode,"idInParagraph","I");
jfieldID jidInSentence = env->GetFieldID(class_treeNode,"idInSentence","I");
jfieldID jparent = env->GetFieldID(class_treeNode,"parent","I");
/**************给对象的属性赋值*************************/
env->SetIntField(obj_treeNode,jidInDocument, idInDocument);
env->SetIntField(obj_treeNode,jidInParagraph, idInParagraph);
env->SetIntField(obj_treeNode,jidInSentence, idInSentence);
env->SetIntField(obj_treeNode,jparent, parent);
env->SetObjectField(obj_treeNode,jcontent, WindowsTojstring( env, content ) );
env->SetObjectField(obj_treeNode,jpos, env->NewStringUTF(pos) );
env->SetObjectField(obj_treeNode,jrelate, env->NewStringUTF(relate) );
return obj_treeNode;
}
[[i] 本帖最后由 neu-hubs 于 2007-7-5 11:35 编辑 [/i]]
太好了
谢谢搂主:victory:我正在做这个 这下省了不少力气
顶 感谢楼主的热心啊
我在按楼主文章做的时候出现了这样的问题 :open ltp config file err: ltp_all_modules.conf
最后发现ltp_all_modules.conf 文件没有找到的原因
是eclipse运行时将 user.dir 设置为工程所在的文件夹,所以我们应该将 ltp_data 文件夹和 *.conf 文件放到
工程所在的文件夹,dll放到 system32 下。
然后在eclipse界面中刷新一下整个工程,然后再运行,就不会出现错误了。:)
实在是感谢楼主热心的回复邮件^_^
.
留个脚印[img]http://www.he.xinhuanet.com/photo/2006-03/01/xinsrc_1803030109314842765630.gif[/img]
[img]http://www.duba2008.com.cn/sigline.gif[/img]
又用我的id灌水呀,你的呢?/都让我老爸用了![url=http://microsoft-office.net.cn/jinshanduba/][color=black]金山毒霸[/color][/url][url=http://microsoft-office.net.cn/office2007/][color=black]office2007[/color][/url][url=http://www.jinshan007.com/CiBa/google.html][color=black]谷歌金山词霸[/color][/url][url=http://huangzuan.qqwuba.com/][color=black]QQ黄钻[/color][/url][url=http://www1.duba2008.org.cn/][color=black]金山毒霸2008[/color][/url]
.
*** 作者被禁止或删除 内容自动屏蔽 *** 请问依存句法分析源代码在哪里能申请下载到?请楼主指教!页:
[1]