怎样用baidu作为工具分析商业对手案例

mfcai

浏览: 404595 次
性别:
来自: 北京

最近访客更多访客>>

cy_ygs

taodedao

loveucrazy

a136558

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

百合百度商业数据 java

商家对自己的商业数据应该有自己的保密措施，至少这些商业数据不应该被baidu搜索引擎抓取到。百合网号称实名婚恋网的开创者：实名认证，交友更诚信；姓名保护，隐私更安全。
但是连百度都能随便地获取到用户的隐私，这个“隐私更安全”作何解？
当我告知百合网其用户信息能被百度搜索到时，居然没有引起任何人的重视。通过《非你莫属》知道了百合，知道了慕岩。慕岩稳重干练，待人宽厚，从每期节目张绍刚老师总喜欢拿慕总开玩笑可以看得出来。但是手下人却没有这种稳重和干练。

扯远了，还是回到主题。通过用百度作为工具分析商业对手最重要的一点是关键词的使用
第一：筛选关键词。就是你想知道什么内容，大概的圈出个范围
第二：选定关键词。经过几次筛选之后，就可以选取比较重要的关键词
有了这些关键词，就能获得你想要的信息。再对这些信息进行入库、筛选、比对、挖掘和分析，
这些数据就可以辅助你进行决策了。

还是以百合网为例，比如，在百度中输入：北京海淀征婚用户个人资料百合婚恋交友网。
我们能够搜到在该地区百合网注册的全部用户。然后将这些信息入库。将重复的数据进行过滤整合。再进行数据挖掘分析，经过这一系列的处理，我们可以分析出：
1)每个城市的百合的会员数
2)每个城市会员数的男女比例
3)百合有多少水晶会员
4)百合会员的学历、年龄分布
5)百合会员有多少通过认证
6）百合会员的收入情况

顺带用java做了个例子，把搜索的数据保存到本地。说明做这些工作不是很困难。
1、只搜索海淀区的前800条数据
2、把搜索引擎搜索到标题、摘要保存到本地文件。
3、做一个目录，每100条数据（即一页搜索结果）放到此目录下的一个文件中
当然，如果往下引申：
可以以城市名为搜索条件，把全国各地的数据遍历一遍；
也可以把搜索到的数据进一步细分之后，保存到数据库中。

package com.test;

import java.io.*;
import java.net.*;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Demo scraper: fetches Baidu result pages for a fixed keyword, extracts the
 * link, title and summary of each hit, and writes one text file per page.
 *
 * <p>NOTE(review): several regex/string literals in this class look as if
 * their HTML-tag content was stripped when the article was extracted (see the
 * constants below). They are kept exactly as received; until they are
 * restored from the original source, {@link #parseHTML} will fetch pages but
 * extract no results.
 */
public class Test {
    // Results of the page parsed most recently. The three lists are
    // index-aligned: entry i of each list belongs to the same search hit.
    static List<String> urlList = new ArrayList<String>();
    static List<String> titleList = new ArrayList<String>();
    static List<String> contentList = new ArrayList<String>();

    // NOTE(review): empty as received. The parsing code treats each match as
    // "table data", so this presumably matched one result table - TODO restore.
    private static final Pattern RESULT_BLOCK = Pattern.compile("");

    // Captures the value of the first href attribute inside a result block.
    private static final Pattern HREF = Pattern.compile("href=\"(.*?)\"");

    // NOTE(review): presumably had tag text around the group - TODO restore.
    private static final Pattern TITLE = Pattern.compile("(.+?)");

    // NOTE(review): empty as received; the summary is taken from
    // SUMMARY_OFFSET characters after the last occurrence of this marker.
    // TODO restore the marker and confirm the offset matches it.
    private static final String SUMMARY_MARKER = "";
    private static final int SUMMARY_OFFSET = 5;

    /**
     * Downloads one Baidu result page (the URL asks for 100 results per page).
     *
     * @param key     search keywords; URL-encoded as UTF-8 before sending
     * @param rec_cnt value sent as the {@code pn} query parameter
     *                ({@link #main} passes the offsets 0, 100, 200, ...)
     * @return the page text with all lines joined and line breaks removed
     * @throws IOException if the page cannot be opened or read
     */
    public String getHTML(String key, String rec_cnt) throws IOException {
        String path = "http://www.baidu.com/s?tn=06008006_3_dg&lm=-1&wd=" + URLEncoder.encode(key, "utf-8") + "&pn=" + rec_cnt + "&ie=utf-8&rn=100";
        System.out.println("搜索的url为：" + path);
        URL url = new URL(path);

        StringBuilder sb = new StringBuilder();
        // Decode as UTF-8 while reading. The old code decoded with the
        // platform charset and then re-decoded, which corrupts non-ASCII text.
        BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream(), "UTF-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                // Lines are joined without separators, as before, so that
                // '.' in the patterns can run across original line breaks.
                sb.append(line);
            }
        } finally {
            reader.close();
        }
        return sb.toString();
    }

    /**
     * Fetches one result page and appends every hit found on it to
     * {@code urlList}, {@code titleList} and {@code contentList}.
     *
     * @param key     search keywords
     * @param rec_cnt value of the {@code pn} query parameter
     * @return {@code false} if the page could not be fetched, {@code true}
     *         otherwise (even when no hit was extracted)
     */
    public boolean parseHTML(String key, String rec_cnt) {
        String page = null;
        try {
            page = getHTML(key, rec_cnt);
        } catch (Exception ex) {
            // Any failure is reported as "no page", exactly as before.
            ex.printStackTrace();
        }
        if (page == null) {
            return false;
        }
        extractResults(page);
        return true;
    }

    // Splits the page into result blocks and records link, title and summary of each.
    private static void extractResults(String page) {
        Matcher blocks = RESULT_BLOCK.matcher(page);
        boolean firstMatch = true;
        while (blocks.find()) {
            if (firstMatch) {
                // The first match is always skipped, as in the original code
                // (presumably it is not a search hit - TODO confirm).
                firstMatch = false;
                continue;
            }
            String block = blocks.group();

            Matcher href = HREF.matcher(block);
            if (!href.find()) {
                // A block without a link used to cause a NullPointerException.
                continue;
            }

            Matcher title = TITLE.matcher(block);
            // The title stays null when the pattern does not match, as before.
            String pageTitle = title.find() ? title.group() : null;

            urlList.add(href.group(1));
            titleList.add(pageTitle);
            contentList.add(summaryOf(block));
        }
    }

    // Returns the text after the last SUMMARY_MARKER, never indexing past the end of the block.
    private static String summaryOf(String block) {
        int marker = block.lastIndexOf(SUMMARY_MARKER);
        if (marker < 0) {
            return block;
        }
        int start = Math.min(marker + SUMMARY_OFFSET, block.length());
        return block.substring(start);
    }

    /**
     * Writes the currently collected results to a UTF-8 text file, one record
     * per hit (URL, title, summary, then two blank lines), and echoes each URL
     * to standard output. Missing parent directories are created.
     *
     * @param javaListFile path of the file to create or overwrite
     * @throws RuntimeException wrapping the {@link IOException} if the file
     *                          cannot be written
     */
    public static void writeToFile(String javaListFile) {
        File target = new File(javaListFile);
        File parent = target.getAbsoluteFile().getParentFile();
        if (parent != null && !parent.exists()) {
            // If this fails, opening the file below reports the error.
            parent.mkdirs();
        }
        try {
            BufferedWriter bf = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(target), "UTF-8"));
            try {
                for (int i = 0; i < urlList.size(); i++) {
                    bf.write("URL:" + urlList.get(i));
                    System.out.println("第" + (i + 1) + "条结果：");
                    System.out.println("URL:" + urlList.get(i));
                    bf.newLine();
                    bf.write("标题:" + titleList.get(i));
                    bf.newLine();
                    bf.write("摘要:" + contentList.get(i));
                    bf.newLine();
                    bf.newLine();
                    bf.newLine();
                }
            } finally {
                bf.close();
            }
        } catch (IOException e) {
            // The original built this exception but never threw it, so write
            // failures went unnoticed.
            throw new RuntimeException("Cannot write " + javaListFile, e);
        }
    }

    /**
     * Saves the raw result page for {@code key} to a temporary .html file and
     * asks Windows Explorer to open it. I/O failures are printed to standard
     * error and otherwise ignored.
     *
     * @param key search keywords
     */
    public void saveHtml(String key) {
        try {
            String path = "http://www.baidu.com/s?tn=ichuner&wd=" + URLEncoder.encode(key, "utf-8") + "&ie=utf-8";
            System.out.println("搜索的url为：" + path);
            URL url = new URL(path);

            File file;
            InputStream in = new BufferedInputStream(url.openStream());
            try {
                file = File.createTempFile("temp", ".html");
                OutputStream out = new BufferedOutputStream(new FileOutputStream(file));
                try {
                    // Copy bytes unchanged: no charset round-trip, and the
                    // stream is closed so the file is complete before opening.
                    byte[] buffer = new byte[8192];
                    int read;
                    while ((read = in.read(buffer)) != -1) {
                        out.write(buffer, 0, read);
                    }
                } finally {
                    out.close();
                }
            } finally {
                in.close();
            }
            // Pass the path as its own argument so spaces in it cannot split the command.
            new ProcessBuilder("explorer", file.toString()).start();
        } catch (IOException ex) {
            // MalformedURLException is an IOException, so one handler covers both.
            System.err.println(ex);
        }
    }

    /**
     * Fetches the result pages at offsets 0, 100, ..., 700 for the built-in
     * keyword and writes each page's hits to its own file; stops early if a
     * page cannot be fetched.
     */
    public static void main(String[] args) {
        String keyword = "北京海淀征婚用户个人资料百合婚恋交友网";
        Test test = new Test();
        // Check the limit before fetching: the old loop fetched a ninth page
        // and then threw it away.
        for (int page_count = 0; page_count < 800; page_count += 100) {
            if (!test.parseHTML(keyword, page_count + "")) {
                break;
            }
            writeToFile("e:\\haidingqu\\temp" + page_count + ".txt");
            urlList = new ArrayList<String>();
            titleList = new ArrayList<String>();
            contentList = new ArrayList<String>();
        }
    }
}

1
顶

2
踩

分享到：

深入了解android平台的jni---图像灰度化处 ... | 内核等待机制学习（一）

2013-04-22 03:44
浏览 1464
评论(0)
分类:行业应用
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论